Load data files
library(dplyr)
# specify the directory where the files are located
# For NOAA machines
# dir_path <- "/Users/enrique.montes/Google Drive/My Drive/GDrive/OCED_AOML/WS_cruises/plankton_imaging/CPICS/TS.Master_selection"
# For personal machine
dir_path <- "~/Library/CloudStorage/GoogleDrive-enriquemontes01@gmail.com/My Drive/GDrive/OCED_AOML/WS_cruises/plankton_imaging/CPICS/TS.Master_selection"
# obtain a list of file names in the directory
file_names <- list.files(path = dir_path, pattern = ".txt", full.names = TRUE)
# loop over each file and import the tables (use this for DATES)
for (file in file_names) {
table_name <- gsub(".txt", "", basename(file)) # get the name of the table from the file name
assign(table_name, read.table(file = file, header = FALSE, sep = "\t") %>%
mutate(date = as.POSIXct(substr(V1, start = 24, stop = 36), format="%Y%m%d_%H%M", tz="UTC")))
}
# # loop over each file and import the tables (use this for STRINGS)
# for (file in file_names) {
# table_name <- gsub(".txt", "", basename(file)) # get the name of the table from the file name
# assign(table_name, read.table(file = file, header = FALSE, sep = "\t"))
# }
Match file name using DATE values
library(tidyverse)
library(lubridate)
dir_path2 <- "~/Library/CloudStorage/GoogleDrive-enriquemontes01@gmail.com/My Drive/GDrive/OCED_AOML/WS_cruises/plankton_imaging/CPICS/ws_cruise_ctd"
file_name <- list.files(path = dir_path2, pattern = "ctd_meta_v3.csv", full.names = TRUE)
ctd_meta <- read.csv(file_name, fill = TRUE)
# USE WITH ctd_meta_v3.csv
dt_list <- as.POSIXct(paste(ctd_meta$year,
sprintf("%02d", ctd_meta$month),
sprintf("%02d", ctd_meta$day),
ctd_meta$time_gmt),
format = "%Y%m%d %I:%M:%S %p",
tz = "UTC")
################################################################################################################
# This section detects short transit times between stations
# Calculate time differences in seconds between consecutive dt_list objects
time_differences <- as.numeric(difftime(dt_list[-1], dt_list[-length(dt_list)], units = "secs"))
# Convert time differences from seconds to minutes
time_differences_mins <- time_differences / 60
# Create a data frame showing the original times and their differences in minutes
time_diff_df <- data.frame(
start_time = dt_list[-length(dt_list)],
end_time = dt_list[-1],
time_difference_mins = time_differences_mins
)
# find CTD time stamps of consecutive stations within less than 20 min. This will identify CTD casts that are close to each other
short_t_idx <- which(time_diff_df$time_difference_mins < 20)
short_timestamps <- dt_list[short_t_idx]
#################################################################################################################
# Create empty data frame to store results
conc_occ_final <- data.frame(date = character(), count = numeric())
# List of class objects to be processed
class_names <- c("Acantharea", "Centric", "Ceratium", "Chaetoceros", "Chaetognaths",
"Chain2", "Chain3", "Ostracods", "Copepods", "Decapods", "Echinoderms",
"Guinardia", "Jellies", "Larvaceans", "Neocalyptrella", "Noctiluca",
"Polychaets", "Pteropods", "Tricho")
# time buffer before and after CTD time in seconds so that CPICS records are matched to CTD times.
start <- 10 * 60
stop <- 10 * 60
# Iterate over dt_list intervals
for (i in 1:length(dt_list)) {
# Initialize a list to store counts for the current interval
counts_list <- list(date = dt_list[i])
# Iterate over each class object and perform subsetting
for (class_name in class_names) {
class_data <- get(paste0("class.", class_name)) # Dynamically get the class data frame
if (i < length(dt_list)) {
# Subsetting for all intervals except the last one
subset_data <- subset(class_data, date >= dt_list[i]-start & date < dt_list[i+1]-stop)
} else {
# Subsetting for the last interval: capture all data greater than or equal to the last dt_list
subset_data <- subset(class_data, date >= dt_list[i]-start)
}
counts_list[[class_name]] <- nrow(subset_data)
}
# Convert counts_list to a data frame and bind it to the result
result <- as.data.frame(counts_list)
conc_occ_final <- rbind(conc_occ_final, result)
}
# Combine with ctd_meta
taxa_meta <- cbind(ctd_meta, conc_occ_final)
################################################################################
# Check for unaccounted CPICS records
# Initialize a list to store unaccounted dates for each class
unaccounted_dates_list <- list()
# Iterate over each class object
for (class_name in class_names) {
# Get the date-time objects from the current class
sel_class_dates <- get(paste0("class.", class_name))$date
# Initialize a logical vector to track whether each class date is accounted for
is_accounted_for <- rep(FALSE, length(sel_class_dates))
# Check each class date against the intervals in dt_list
for (i in 1:length(dt_list)) {
if (i < length(dt_list)) {
# Check all intervals except the last one
interval_start <- dt_list[i] - start
interval_end <- dt_list[i + 1] - stop
} else {
# Last interval captures all data greater than or equal to the last dt_list
interval_start <- dt_list[i] - start
interval_end <- Inf # Effectively no upper bound
}
# Mark class dates that fall within the current interval as accounted for
is_accounted_for <- is_accounted_for | (sel_class_dates >= interval_start & sel_class_dates < interval_end)
}
# Subset the class dates that were not accounted for
unaccounted_sel_class_dates <- sel_class_dates[!is_accounted_for]
# Store the unaccounted dates in the list
unaccounted_dates_list[[class_name]] <- unaccounted_sel_class_dates
}
# Print or view the unaccounted dates for each class
for (class_name in class_names) {
cat("Unaccounted dates for class:", class_name, "\n")
print(unaccounted_dates_list[[class_name]])
cat("\n")
}
Unaccounted dates for class: Acantharea
POSIXct of length 0
Unaccounted dates for class: Centric
POSIXct of length 0
Unaccounted dates for class: Ceratium
POSIXct of length 0
Unaccounted dates for class: Chaetoceros
POSIXct of length 0
Unaccounted dates for class: Chaetognaths
POSIXct of length 0
Unaccounted dates for class: Chain2
POSIXct of length 0
Unaccounted dates for class: Chain3
POSIXct of length 0
Unaccounted dates for class: Ostracods
POSIXct of length 0
Unaccounted dates for class: Copepods
POSIXct of length 0
Unaccounted dates for class: Decapods
POSIXct of length 0
Unaccounted dates for class: Echinoderms
POSIXct of length 0
Unaccounted dates for class: Guinardia
POSIXct of length 0
Unaccounted dates for class: Jellies
POSIXct of length 0
Unaccounted dates for class: Larvaceans
POSIXct of length 0
Unaccounted dates for class: Neocalyptrella
POSIXct of length 0
Unaccounted dates for class: Noctiluca
POSIXct of length 0
Unaccounted dates for class: Polychaets
POSIXct of length 0
Unaccounted dates for class: Pteropods
POSIXct of length 0
Unaccounted dates for class: Tricho
POSIXct of length 0
# Calculate plankton concentration time series per station
# Calculate species concentration (counts/ml) for each species
# # For zooplankton
# taxa_meta_concentration <- taxa_meta %>%
# mutate(across(c(Acantharea,Chaetognaths,Ostracods,Copepods,Decapods,Echinoderms,Jellies,
# Larvaceans,Polychaets,Pteropods), ~ ./total_vol_sampled * 1e6)) %>%
# select(Station, dec_lat, dec_lon, year, month, date, Avg.chl.a..ug.L.,
# temp..degC., salinity, total_vol_sampled,
# Acantharea,Chaetognaths,Ostracods,Copepods,Decapods,Echinoderms,
# Jellies,Larvaceans,Polychaets,Pteropods) %>%
# filter(!is.na(total_vol_sampled))
#
# # # Transform taxa_meta_concentration to long format
# taxa_meta_long <- taxa_meta_concentration %>%
# pivot_longer(cols = c(Acantharea, Chaetognaths,Ostracods,Copepods,Decapods,
# Echinoderms, Jellies, Larvaceans,Polychaets, Pteropods),
# names_to = "species", values_to = "species_concentration")
# For phytoplankton
taxa_meta_concentration <- taxa_meta %>%
mutate(across(c(Centric,Ceratium,Chaetoceros,Chain2,Chain3,Guinardia,Neocalyptrella,
Noctiluca,Tricho), ~ ./total_vol_sampled * 1e6)) %>%
select(Station, dec_lat, dec_lon, year, month, date, Avg.chl.a..ug.L.,
temp..degC., salinity,total_vol_sampled,
Centric,Ceratium,Chaetoceros,Chain2,Chain3,Guinardia,Neocalyptrella,
Noctiluca,Tricho) %>%
filter(!is.na(total_vol_sampled))
# # Transform taxa_meta_concentration to long format
taxa_meta_long <- taxa_meta_concentration %>%
pivot_longer(cols = c(Centric,Ceratium,Chaetoceros,Chain2,Chain3,Guinardia,Neocalyptrella,
Noctiluca,Tricho),
names_to = "species", values_to = "species_concentration")
# Filter the data for Stations
station_list_fk <- c("WS","21/LK","MR","16","18","10","12","9.5","9","7","2")
# station_list_sr <- c("68","65","64","60","58","57.3","57.2","57.1","57","56","55","54","53","51",
# "49","47","45","41","30","31","33")
# station_list_cal <- c("CAL5","CAL4","CAL3","CAL2","CAL1","RP1","RP2","RP3","RP4","GP5","BG4","BG3",
# "BG2","BG1","BG6", "BG7")
# station_list_vl <- c("V1","V2","V3","V4","V5","V6","V7","V8","V9","L1","L3","L5","L7","L9")
# station_list_tb <- c("AMI9","AMI8","AMI7","AMI6","AMI5","AMI4","AMI3","AMI2","AMI1","TB1","TB2",
# "TB3","TB4","TB5","TB10","CW4","CW3","CW2","CW1")
filtered_taxa_meta_long <- taxa_meta_long %>%
filter(Station %in% station_list_fk)
# Without filtering per group of stations but looking at the entire region
# filtered_taxa_meta_long <- taxa_meta_long
# Calculate mean and standard deviation of species concentration for each date
summary_data <- filtered_taxa_meta_long %>%
group_by(year, month, species) %>%
summarise(mean_concentration = mean(species_concentration),
sd_concentration = sd(species_concentration))
`summarise()` has grouped output by 'year', 'month'. You can override using the `.groups` argument.
# Combine year and month columns into a single date column
summary_data$date <- as.Date(paste(summary_data$year, summary_data$month, "15", sep = "-"))
# custom_pal_hex2 <- c('#e41a1c','#377eb8','#4daf4a','#984ea3','#ff7f00','#ffff33','#a65628','#f781bf')
custom_pal_hex2 <- c('darkturquoise','red','#4daf4a','slategray4','orange','dodgerblue','purple', "yellow","pink","violet")
# Acantharea = darkturquoise
# Copepods = red
# echinos = #4daf4a
# Jellies = slategray4
# Larvaceans = orange
# Polychaetes = dodgerblue
# Pteropods = purple
# # Check dominance of groups by calculating the overall mean
# test <- summary_data %>% filter(species == "Polychaetes") %>% summarize(avg = mean(mean_concentration))
# mean(test$avg)
# Plot time series of mean+sd planton concentration per group of sites
zz <- ggplot(summary_data, aes(x = date, y = mean_concentration, fill = species)) +
geom_bar(stat = "identity", alpha = 0.7) +
scale_fill_manual(values = custom_pal_hex2) +
# geom_errorbar(aes(ymin = 0, ymax = mean_concentration + sd_concentration),
# position = position_dodge(width = 0.9), width = 0.2) +
labs(x = "Date", y = "Mean Species Concentration", fill = "Species") +
theme_minimal()
zz

# Filter summary_data for selected only
# Acantharea, Copepods, Echinoderms, Jellies, Larvaceans, Chaetognaths, Polychaetes, Pteropods
# Centric,Ceratium,Chaetoceros,Diatoms,Diatoms2,Guinardia,Neocalyptrella,Noctiluca,Trichodesmium
summary_selected <- filtered_taxa_meta_long %>%
filter(species == "Tricho") %>%
mutate(date = as.Date(paste(year, month, "15", sep = "-")))
# Create the time series boxplot
sel_box <- ggplot(summary_selected, aes(x = as.factor(date), y = species_concentration)) +
geom_boxplot(fill = "white", notch = FALSE, outlier.shape = NA) +
geom_violin(fill = "lightgrey", color = "NA", alpha = 0.7) +
# geom_jitter(aes(color = factor(Station), size = temp..degC.), alpha = 0.9) +
geom_jitter(width = 0.25, aes(color = salinity, size = temp..degC.), alpha = 0.6) +
# geom_jitter(width = 0.25, aes(color = salinity), size = 3, alpha = 0.6) +
labs(x = "Date", y = expression("Density (org. m"^"-3"~")")) +
scale_color_viridis_c(name = "Salinity", option = "mako") +
scale_size_continuous(name = "Temperature (°C)",
limits = c(15, 35),
breaks = c(15, 20, 25, 30, 35),
range = c(1, 4)) +
ylim(-5, 30000) +
theme_minimal() +
# theme_ipsum() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
sel_box

# # Plot concentration time series for all selected sites
# kk <- ggplot(summary_selected, aes(x = date, y = species_concentration, shape = factor(Station))) +
# labs(x = "Date", y = "Species concentration (org.m-3)") +
# geom_point(aes(size = temp..degC.), fill = "darkgrey") + # or colour = factor(Station)
# scale_shape_manual(values = c(0, 1, 2, 3, 4, 5, 6))
# kk
# kk <- ggplot(summary_selected, aes(x = date, y = species_concentration, colour = factor(Station))) +
# labs(x = "Date", y = "Species concentration (org.m-3)") +
# geom_point(aes(size = temp..degC.), alpha = 0.7) + # or colour = factor(Station) +
# # geom_smooth(method = "lm", formula = y ~ poly(x, 4), se = FALSE) + # Add polynomial fit line
# scale_colour_brewer(palette = "Set1")
# kk
# # Find the index of the last row with date == '2023-01-15' and add a dummy row for missing dates
# new_row <- tibble(Station = NA, dec_lat = NA, dec_lon = NA, year = NA,
# month = NA, date = as.Date('2023-03-15'), Avg.chl.a..ug.L. = NA,
# temp..degC. = NA, salinity = NA, species = NA, species_concentration = NA)
# last_index <- max(which(summary_selected$date == as.Date('2023-01-15')))
# summary_selected <- bind_rows(
# summary_selected[1:last_index, ],
# new_row,
# summary_selected[(last_index + 1):nrow(summary_selected), ]
# )
# Calculates total plankton concentrations aggregating data from
selected stations (eg FK)
# Calculate plankton concentration for all species summed up
# For Phyto
taxa_meta_concentration_all <- taxa_meta %>%
mutate(total_concentration = (
Centric + Ceratium + Chaetoceros + Chain2 + Chain3 + Guinardia + Neocalyptrella +
Noctiluca + Tricho) / total_vol_sampled * 1e6) %>%
select(Station, dec_lat, dec_lon, year, month, date, Avg.chl.a..ug.L.,
temp..degC., salinity, total_vol_sampled, total_concentration) %>%
filter(!is.na(total_vol_sampled))
# # For Zooplankton
# taxa_meta_concentration_all <- taxa_meta %>%
# mutate(total_concentration = (
# Acantharea + Chaetognaths + Ostracods + Copepods + Decapods + Echinoderms + Jellies +
# Larvaceans + Polychaets + Pteropods) / total_vol_sampled * 1e6) %>%
# select(Station, dec_lat, dec_lon, year, month, date, Avg.chl.a..ug.L.,
# temp..degC., salinity, total_vol_sampled, total_concentration) %>%
# filter(!is.na(total_vol_sampled))
# Filter the data for Stations
station_list_fk <- c("WS","21/LK","MR","16","18","10","12","9.5","9","7","2")
filtered_taxa_conc_all <- taxa_meta_concentration_all %>%
filter(Station %in% station_list_fk)
# Aggregate total concentration and total volume sampled per year and month
aggregated_concentration <- filtered_taxa_conc_all %>%
group_by(year, month) %>%
summarise(
total_counts = sum(total_concentration * total_vol_sampled / 1e6, na.rm = TRUE), # Sum up all counts
total_vol_sampled = sum(total_vol_sampled, na.rm = TRUE), # Sum up all volume sampled
mean_temp_degC = mean(temp..degC., na.rm = TRUE), # Calculate mean temperature
mean_salinity = mean(salinity, na.rm = TRUE), # Calculate mean salinity
mean_chla = mean(Avg.chl.a..ug.L., na.rm = TRUE)
) %>%
ungroup() %>%
mutate(total_concentration = (total_counts / total_vol_sampled) * 1e6) # Recalculate the concentration
`summarise()` has grouped output by 'year'. You can override using the `.groups` argument.
# Create a Date column for plotting purposes
aggregated_concentration <- aggregated_concentration %>%
mutate(year_month = as.Date(paste(year, month, "01", sep = "-")))
# Create the time series plot
concentration_ts <- ggplot(aggregated_concentration, aes(x = year_month)) +
geom_line(aes(y = total_concentration), color = "blue", size = 1) + # Line plot for total concentration
geom_point(aes(y = total_concentration), color = "red", size = 2) + # Points for total concentration
# geom_line(aes(y = (mean_temp_degC - 20) / (35 - 20) * max(total_concentration)),
# color = "orange", size = 1) + # Line plot for temperature (normalized)
labs(title = "Total Phytoplankton Concentration Time Series",
x = "Date",
y = expression("Total Concentration (org/m"^3~")")) +
scale_x_date(date_breaks = "1 month", date_labels = "%b %Y") + # Show ticks for every month
scale_y_continuous(
name = expression("Total Concentration (org/m"^3~")"), # Primary y-axis label
# sec.axis = sec_axis(~ . * (35 - 20) / max(aggregated_concentration$total_concentration) + 20,
# name = "Temperature (°C)",
# breaks = seq(20, 35, by = 5)) # Secondary y-axis label
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
# axis.title.y.right = element_text(color = "orange"), # Color the secondary y-axis label
# axis.line.y.right = element_line(color = "orange"), # Color the secondary y-axis line
# panel.grid.major = element_blank(), # Remove major grid lines
panel.grid.minor = element_blank(), # Remove minor grid lines
# panel.border = element_blank() # Remove panel border if necessary
)
# Print the plot
concentration_ts

# Calculate plankton concentrations per seascape class
spp_df <- taxa_meta[ , c("X8.day.seascapes", "total_vol_sampled", "date",
"Acantharea",
"Copepods",
"Echinoderms",
"Jellies",
"Larvaceans",
"Polychaets",
"Chaetognaths",
"Pteropods")]
# spp_df <- taxa_meta[ , c("X8.day.seascapes", "total_vol_sampled", "date",
# "Ceratium",
# "Chaetoceros",
# "Chain2",
# "Chain3",
# "Guinardia",
# "Neocalyptrella",
# "Tricho")]
# Remove rows with NaN values in X8.day.seascapes and total_vol_sampled
spp_df <- subset(spp_df, !is.na(X8.day.seascapes) & !is.na(total_vol_sampled))
# Calculate total counts per species within each X8.day.seascapes category
spp_count_per_seascape <- spp_df %>%
gather(key = "species", value = "count", -(X8.day.seascapes:date)) %>%
group_by(X8.day.seascapes, species) %>%
summarise(total_count = sum(count)) %>%
ungroup()
`summarise()` has grouped output by 'X8.day.seascapes'. You can override using the `.groups` argument.
# Calculate total volume sampled per X8.day.seascapes category
total_vol_per_seascape <- spp_df %>%
group_by(X8.day.seascapes) %>%
summarise(total_vol_sampled = sum(total_vol_sampled))
# Merge total counts and total volume data frames
spp_concentration <- merge(spp_count_per_seascape, total_vol_per_seascape, by = "X8.day.seascapes")
# Calculate species per cubic meter: might be misleading since it integrates all counts and sampled volumes across cruises to calculate concentrations. It is likely better to use average concentrations as below.
spp_concentration$species_per_cubic_meter <- spp_concentration$total_count / spp_concentration$total_vol_sampled * 1e6
# # Select desired seascapes and reorder categories in X axis
spp_concentration$X8.day.seascapes <- factor(spp_concentration$X8.day.seascapes, levels = c("3", "5", "7", "11", "13", "15","21","27"))
# Filter out seascape class as desired
# exclude_classes <- c(5, 7, 11)
# spp_concentration <- spp_concentration[!spp_concentration$X8.day.seascapes %in% exclude_classes, ]
custom_pal_hex2 <- c('#e41a1c','#377eb8','#4daf4a','#984ea3','#ff7f00','#ffff33','#a65628','#f781bf')
# Create the stack plot
concentration_stackplot <- ggplot(spp_concentration, aes(x = X8.day.seascapes, y = species_per_cubic_meter, fill = species)) +
geom_bar(stat = "identity") +
scale_fill_manual(values = custom_pal_hex2) +
labs(x = "Seascape class", y = "Species concentration (org. m"^"-3"~")") +
theme_minimal() +
theme(axis.text.x = element_text(size = 18), # Set X-axis label font size
axis.text.y = element_text(size = 18)) +
theme(axis.title.x = element_text(size = 18),
axis.title.y = element_text(size = 18)) +
theme(legend.text = element_text(size = 16))
concentration_stackplot

# Calculate percentages
spp_concentration_percent <- spp_concentration %>%
group_by(X8.day.seascapes) %>%
mutate(total_concentration = sum(species_per_cubic_meter),
species_percentage = (species_per_cubic_meter / total_concentration) * 100)
# Create the stacked plot
concentration_percent_stackplot <- ggplot(spp_concentration_percent, aes(x = X8.day.seascapes, y = species_percentage, fill = species)) +
geom_bar(stat = "identity") +
scale_fill_manual(values = custom_pal_hex2) +
labs(x = "Seascape class", y = "Relative concentration (%)") +
theme_minimal() +
theme(axis.text.x = element_text(size = 18), # Set X-axis label font size
axis.text.y = element_text(size = 18)) +
theme(axis.title.x = element_text(size = 18),
axis.title.y = element_text(size = 18)) +
theme(legend.text = element_text(size = 16))
concentration_percent_stackplot

# Gather columns containing species counts into key-value pairs and calculate concentrations
spp_concentration_long <- spp_df %>%
gather(key = "species", value = "count", -(X8.day.seascapes:date)) %>%
mutate(concentration = (count / total_vol_sampled) * 1e6)
# Calculate the mean concentration and its standard deviation per species and 8X.day.seascapes category. This is likely more representative since the approach will capture high concentration events (high counts in low volumes) that otherwise get filtered out when using an overall integrated sampled volume and total counts for the total concentration.
avg_concentration_per_class <- spp_concentration_long %>%
group_by(species, `X8.day.seascapes`) %>%
summarize(
mean_concentration = mean(concentration, na.rm = TRUE),
sd_concentration = sd(concentration, na.rm = TRUE)
)
`summarise()` has grouped output by 'species'. You can override using the `.groups` argument.
avg_concentration_per_class$X8.day.seascapes <- factor(avg_concentration_per_class$X8.day.seascapes, levels = c("3", "5", "7", "11", "13", "15","21","27"))
avg_stackplot <- ggplot(avg_concentration_per_class, aes(x = X8.day.seascapes, y = mean_concentration, fill = species)) +
geom_bar(stat = "identity") +
# scale_fill_manual(values = custom_pal_hex2) + # use for zooplankton
scale_colour_brewer(palette = "Set2") + # use for phytoplankton
labs(x = "Seascape class", y = "Mean density (org. m"^"-3"~")") +
theme_minimal() +
theme(axis.text.x = element_text(size = 18), # Set X-axis label font size
axis.text.y = element_text(size = 18)) +
theme(axis.title.x = element_text(size = 18),
axis.title.y = element_text(size = 18)) +
theme(legend.text = element_text(size = 16))
avg_stackplot

# Calculate percentages
avg_concentration_percent <- avg_concentration_per_class %>%
group_by(`X8.day.seascapes`) %>%
mutate(mean_concentration_percent = mean_concentration / sum(mean_concentration) * 100)
avg_percent_stackplot <- ggplot(avg_concentration_percent, aes(x = X8.day.seascapes, y = mean_concentration_percent, fill = species)) +
geom_bar(stat = "identity") +
scale_fill_manual(values = custom_pal_hex2) + # use for zooplankton
# scale_colour_brewer(palette = "Set2") + # use for phytoplankton
labs(x = "Seascape class", y = "Relative concentration (%)") +
theme_minimal() +
theme(axis.text.x = element_text(size = 18), # Set X-axis label font size
axis.text.y = element_text(size = 18)) +
theme(axis.title.x = element_text(size = 18),
axis.title.y = element_text(size = 18)) +
theme(legend.text = element_text(size = 16))
avg_percent_stackplot

Generate plots
library(tidyverse)
library(hrbrthemes)
library(viridis)
# # Load seascape color palette used with Matlab and extract RGB values for observed unique seascapes
# For NOAA machines
# palette_dir <- "/Users/enrique.montes/Library/CloudStorage/GoogleDrive-enriquemontes01@gmail.com/My Drive/GDrive/software/matlab/m_map/seascape_cm"
# For personal machine
palette_dir <- "~/Library/CloudStorage/GoogleDrive-enriquemontes01@gmail.com/My Drive/GDrive/software/matlab/m_map/seascape_cm"
palette_file <- list.files(path = palette_dir, pattern = "cmap1.csv", full.names = TRUE)
palette_df <- read.csv(palette_file, header = FALSE)
colnames(palette_df) <- c("r", "g", "b")
unique_seascapes <- sort(unique(na.omit(ctd_meta$X8.day.seascapes)))
subset_palette_df <- palette_df[unique_seascapes, ]
# set RGB values for the plots
r_vals <- round(subset_palette_df$r * 255, 0)
g_vals <- round(subset_palette_df$g * 255, 0)
b_vals <- round(subset_palette_df$b * 255, 0)
custom_pal <- cbind(r_vals, g_vals, b_vals)
custom_pal_hex <- rgb(custom_pal[, 1], custom_pal[, 2], custom_pal[, 3], maxColorValue=255)
# pal_final <- c(custom_pal_hex[3], custom_pal_hex[4], custom_pal_hex[5], )
# filter out rows with NA values in column seascapes
df_filtered <- taxa_meta[complete.cases(taxa_meta$X8.day.seascapes),]
# Convert the 'x' column to character
df_filtered$X8.day.seascapes <- as.character(df_filtered$X8.day.seascapes)
# # Reorder seascape categories in X axis
df_filtered$X8.day.seascapes <- factor(df_filtered$X8.day.seascapes, levels = c("3", "5", "7", "11", "13", "15","21","27"))
# Define custom colors for each level
custom_colors <- c("3" = custom_pal_hex[1], "5" = custom_pal_hex[2], "7" = custom_pal_hex[3],
"11" = custom_pal_hex[4], "13" = custom_pal_hex[5], "15" = custom_pal_hex[6],
"21" = custom_pal_hex[7], "27" = custom_pal_hex[8])
# Filter out seascape class as desired
# df_filtered <- df_filtered[df_filtered$X8.day.seascapes != "5", ]
# Plot
# See https://cran.r-project.org/web/packages/viridis/vignettes/intro-to-viridis.html for color palette options
pp <- df_filtered %>%
ggplot( aes(x=X8.day.seascapes, y=Avg.chl.a..ug.L., fill=X8.day.seascapes)) +
geom_boxplot() +
# scale_fill_viridis(option="H", discrete = TRUE, alpha=0.6) +
scale_fill_manual(values = custom_colors) +
geom_jitter(color="grey", size=0.8, alpha=0.4) +
labs(x = "Seascape class") +
labs(y = expression("["*Chl-a*"]"~ mu*"g"~L^-1)) +
# labs(y = expression("Salinity")) +
# labs(y = expression(paste("Temperature (", degree, "C) at 1 m depth"))) +
# labs(y = expression("["*DO*"]"~mg~L^-1)) +
# labs(y = expression("["*NO["x"]*"]" ~ mu*"M")) +
# labs(y = expression("["*PO[4]^"3-"*"]" ~ mu*"M")) +
# labs(y = expression("["*NH[4]^"+"*"]" ~ mu*"M")) +
# theme(axis.title.y = element_text(hjust = 1))
# scale_x_discrete(labels= c("Tropical/Subtropical Upwelling",
# "Tropical Seas",
# "Warm, Blooms, High Nuts",
# "Tropical/Subtropical Transition",
# "Temperate Transition")) +
theme_ipsum() +
theme(
legend.position="none",
plot.title = element_text(size=11)
) +
# ggtitle("A boxplot with jitter") +
# theme(axis.text.x = element_text(angle = 45)) +
ylim(0, 7) +
theme(axis.text.x = element_text(size = 32, family = "Arial"), # Set X-axis label font size
axis.text.y = element_text(size = 32, family = "Arial")) +
theme(axis.title.x = element_text(size = 18, hjust = 0.5, family = "Arial"),
axis.title.y = element_text(size = 18, hjust = 0.5, family = "Arial")) +
theme(legend.text = element_text(size = 32))
pp
Warning: Removed 184 rows containing non-finite values (`stat_boxplot()`).
Warning: Removed 184 rows containing missing values (`geom_point()`).
Create stackplots showing relative frequency using counts only of
plankton taxa per seascape category
# convert abundance columns to relative frequency
# subsets taxa for zooplankton
# df_subset <- df_filtered[ , c("X8.day.seascapes",
# "Acantharea",
# "Copepods",
# "Echinoderms",
# "Jellies",
# "Larvaceans",
# "Polychaetes",
# "Chaetognaths",
# "Pteropods")]
# subsets taxa for phytoplankton
df_subset <- df_filtered[ , c("X8.day.seascapes",
"Ceratium",
"Chaetoceros",
"Chain2",
"Chain3",
"Guinardia",
"Neocalyptrella",
"Tricho")]
# subsets taxa for all species
# df_subset <- df_filtered[ , c("X8.day.seascapes",
# "Ceratium",
# "Chaetoceros",
# "Diatoms",
# "Diatoms2",
# "Guinardia",
# "Neocalyptrella",
# "Trichodesmium",
# "Acantharea",
# "Copepods",
# "Echinoderms",
# "Jellies",
# "Larvaceans",
# "Polychaetes",
# "Chaetognaths",
# "Pteropods")]
# reshape the data to long format
df_long <- tidyr::gather(df_subset, key = "Species", value = "Frequency", -X8.day.seascapes)
# calculate relative frequencies
df_sum <- df_long %>%
group_by(X8.day.seascapes, Species) %>%
summarise(n = sum(Frequency)) %>%
mutate(freq = n / sum(n))
`summarise()` has grouped output by 'X8.day.seascapes'. You can override using the `.groups` argument.
# Seascape class names:
# "Class 11" - Tropical/Subtropical Upwelling
# "Class 15" - Tropical Seas
# "Class 21" - Warm, Blooms, High Nuts
# "Class 3" - Tropical/Subtropical Transition
# "Class 7" - Temperate Transition
# # Select desired seascapes and reorder categories in X axis
df_sum$X8.day.seascapes <- factor(df_sum$X8.day.seascapes, levels = c("3", "5", "7", "11", "13", "15","21","27"))
# Filter out seascape class as desired
exclude_classes <- c(5, 7, 11)
df_sum <- df_sum[!df_sum$X8.day.seascapes %in% exclude_classes, ]
# Use qualitative palettes: https://colorbrewer2.org/#type=qualitative&scheme=Accent&n=7
custom_pal_hex2 <- c('#e41a1c','#377eb8','#4daf4a','#984ea3','#ff7f00','#ffff33','#a65628','#f781bf')
# custom_pal_hex2 <- c('#1b9e77','#d95f02','#7570b3','#e7298a','#66a61e','#e6ab02','#a6761d')
# create the stackplot
qq <- ggplot(df_sum, aes(x = X8.day.seascapes, y = freq, fill = Species)) +
# scale_fill_viridis(option="Set3", discrete = TRUE, alpha=0.8) +
scale_fill_viridis(discrete = TRUE, alpha=0.8) +
# scale_fill_manual(values = custom_pal_hex2) +
geom_bar(stat = "identity") +
labs(x = "Seascape class") +
labs(y = "Relative frequency") +
# scale_x_discrete(labels= c("Tropical/Subtropical Upwelling",
# "Tropical Seas",
# "Warm, Blooms, High Nuts",
# "Tropical/Subtropical Transition",
# "Temperate Transition")) +
# theme(axis.text.x = element_text(angle = 45)) +
theme_minimal() +
theme(axis.text.x = element_text(size = 18), # Set X-axis label font size
axis.text.y = element_text(size = 18)) +
theme(axis.title.x = element_text(size = 18),
axis.title.y = element_text(size = 18)) +
theme(legend.text = element_text(size = 16))
#theme(axis.text.x = element_text(hjust = 1))
qq

Compute Shannon Index
library(dplyr)
library(vegan)
Loading required package: permute
Loading required package: lattice
This is vegan 2.6-4
# Select species for Shannon calculation
df_subset_shannon <- df_filtered[ , c("X8.day.seascapes",
"Acantharea",
"Copepods",
"Echinoderms",
"Jellies",
"Larvaceans",
"Polychaets",
"Chaetognaths",
"Pteropods",
"Ceratium",
"Chaetoceros",
"Chain2",
"Chain3",
"Guinardia",
"Neocalyptrella",
"Tricho")]
# reshape the data to long format
df_long <- tidyr::gather(df_subset_shannon, key = "Species", value = "Abundance", -X8.day.seascapes)
# Exclude seascapes
exclude_seascapes <- c(5, 7, 11)
# Compute Shannon diversity per seascape class
shannon_df <- df_long %>%
group_by(X8.day.seascapes) %>%
summarise(shannon = diversity(Abundance, index = "shannon"))
# Filter the data to exclude specified seascapes
shannon_df_filtered <- shannon_df[!shannon_df$X8.day.seascapes %in% exclude_seascapes, ]
# create the bar plot of Shannon diversity
ff <- ggplot(shannon_df_filtered, aes(x = X8.day.seascapes, y = shannon, fill = X8.day.seascapes)) +
geom_bar(stat = "identity") +
# scale_fill_viridis(option="plasma", discrete = TRUE, alpha=0.6) +
scale_fill_manual(values = custom_colors) +
xlab("Seascape Class") +
ylab("Shannon Diversity") +
# scale_x_discrete(labels= c("Tropical/Subtropical Upwelling",
# "Tropical Seas",
# "Warm, Blooms, High Nuts",
# "Tropical/Subtropical Transition",
# "Temperate Transition")) +
# theme(axis.text.x = element_text(angle = 45)) +
theme(axis.text.x = element_text(size = 32), # Set X-axis label font size
axis.text.y = element_text(size = 32)) +
theme(axis.title.x = element_text(size = 32),
axis.title.y = element_text(size = 32)) +
theme(legend.text = element_text(size = 32)) +
guides()
ff

# # Subset the data to exclude specified seascapes
filtered_taxa_meta <- taxa_meta[complete.cases(taxa_meta$X8.day.seascapes), ]
filtered_taxa_meta <- filtered_taxa_meta[!filtered_taxa_meta$X8.day.seascapes %in% exclude_seascapes, ]
# # Plot sampled seascape frequency
tt <- ggplot(filtered_taxa_meta, aes(x = factor(X8.day.seascapes), fill = factor(X8.day.seascapes))) +
geom_bar(color = "black", alpha = 0.7) +
scale_fill_manual(values = custom_colors) +
labs(x = "Seascape Class", y = "Frequency") +
theme_minimal() +
theme(axis.text.x = element_text(size = 22), # Set X-axis label font size
axis.text.y = element_text(size = 22)) +
theme(axis.title.x = element_text(size = 22),
axis.title.y = element_text(size = 22)) +
theme(legend.text = element_text(size = 22)) +
guides(fill = FALSE)
Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as of ggplot2 3.3.4.
This warning is displayed once every 8 hours.
Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
tt

# # Counts of species per seascape
df_sum_abund <- df_long %>%
group_by(X8.day.seascapes, Species) %>%
summarise(n = sum(Abundance))
`summarise()` has grouped output by 'X8.day.seascapes'. You can override using the `.groups` argument.
df_sum_abund_filt <- df_sum_abund[df_sum_abund$Species == "Pteropods",]
# Filter the data to exclude specified seascapes
df_sum_abund_filt2 <- df_sum_abund_filt[!df_sum_abund_filt$X8.day.seascapes %in% exclude_seascapes, ]
dd <- ggplot(df_sum_abund_filt2, aes(x = factor(X8.day.seascapes), y = n, fill = factor(X8.day.seascapes))) +
geom_bar(color = "black", alpha = 0.7, stat = "identity") +
# scale_fill_viridis(option = "magma", discrete = TRUE, alpha = 0.8)
scale_fill_manual(values = custom_colors) +
theme_minimal()
dd

# Compute counts of selected species per seascape class normalized by frequency of seascapes
# reshape the data to long format
df_long2 <- tidyr::gather(df_subset_shannon, key = "Species", value = "Abundance", -X8.day.seascapes)
spp_seascape_normalized <- df_long2 %>%
filter(Species == "Pteropods") %>%
group_by(X8.day.seascapes) %>%
summarise(spp_count = sum(Abundance))
# Compute frequencies of each seascape class
seascape_frequencies <- df_long2 %>%
count(X8.day.seascapes) %>%
rename(freq = n)
# Merge counts with frequencies
spp_seascape_normalized <- merge(spp_seascape_normalized, seascape_frequencies, by = "X8.day.seascapes")
# Normalize counts by frequency
spp_seascape_normalized$spp_normalized <- (spp_seascape_normalized$spp_count / spp_seascape_normalized$freq) * 1000
spp_seascape_normalized_filt <- spp_seascape_normalized[!spp_seascape_normalized$X8.day.seascapes %in% exclude_seascapes, ]
bb <- ggplot(spp_seascape_normalized_filt, aes(x = factor(X8.day.seascapes), y = spp_normalized, fill = factor(X8.day.seascapes))) +
geom_bar(color = "black", alpha = 0.7, stat = "identity") +
scale_fill_manual(values = custom_colors) +
labs(x = "Seascape Class", y = "Counts / Seascape freq * 1000") +
theme_minimal() +
# theme(axis.text.x = element_text(size = 32), # Set X-axis label font size
# axis.text.y = element_text(size = 32)) +
# theme(axis.title.x = element_text(size = 32),
# axis.title.y = element_text(size = 32)) +
# theme(legend.text = element_text(size = 32)) +
guides()
bb

PC plot
library(ggalt)
# Perform principal component analysis on the count data
# for hydrography
sel_vars <- c(
"X8.day.seascapes",
"salinity",
"Avg.chl.a..ug.L.",
"PO4...uM.",
"NO3.NO2..uM.",
"NH4...uM.",
"temp..degC.")
# for taxonomy
# sel_vars <- c("X8.day.seascapes",
# "Acantharea",
# "Copepods",
# "Echinoderms",
# "Jellies",
# "Larvaceans",
# "Polychaets",
# "Chaetognaths",
# "Pteropods")
# sel_vars <- c("X8.day.seascapes",
# "Ceratium",
# "Chaetoceros",
# "Chain2",
# "Chain3",
# "Guinardia",
# "Neocalyptrella",
# "Tricho")
# sel_vars <- c("X8.day.seascapes",
# "Acantharea",
# "Copepods",
# "Echinoderms",
# "Jellies",
# "Larvaceans",
# "Polychaets",
# "Chaetognaths",
# "Pteropods",
# "Ceratium",
# "Chaetoceros",
# "Chain2",
# "Chain3",
# "Guinardia",
# "Neocalyptrella",
# "Tricho")
# filter out rows with NA values in column seascapes
# df_filtered_pca <- taxa_meta[complete.cases(taxa_meta$X8.day.seascapes), ]
df_filtered_pca <- taxa_meta[complete.cases(taxa_meta[, sel_vars]), ]
# exclude_seascapes <- c(5, 7, 11)
exclude_seascapes <- 0
filt_df_pca <- df_filtered_pca[!df_filtered_pca$X8.day.seascapes %in%
exclude_seascapes, sel_vars]
# Select numeric columns in filt_df_pca
numeric_cols <- sapply(filt_df_pca, is.numeric)
# Identify numeric columns except the first one
numeric_cols <- 2:ncol(filt_df_pca)
# Transform numeric columns to log scale
filt_df_pca[numeric_cols] <- lapply(filt_df_pca[numeric_cols], function(x) log(x + 1))
# pca <- prcomp(filt_df_pca[, c("salinity", "Avg.chl.a..ug.L.", "PO4...uM.", "NO3.NO2..uM.")], scale. = TRUE)
pca <- prcomp(filt_df_pca[, -1], scale. = TRUE)
# Extract PC1 and PC2 scores for each sampling event
# pc_scores <- data.frame(seascape = df_subset$X8.day.seascapes, # for taxonomic analysis
pc_scores <- data.frame(seascape = as.character(filt_df_pca$X8.day.seascapes), # for hydrography
PC1 = pca$x[, 1],
PC2 = pca$x[, 2])
# Create the plot
pc_scores$seascape <- factor(pc_scores$seascape, levels = c("3", "5", "7", "11", "13", "15", "21", "27"))
bb <- ggplot(pc_scores, aes(x = PC1, y = PC2, color = seascape)) +
geom_point() +
labs(x = "PC1", y = "PC2", color = "Seascape")
# add circle around cluster of data points
# custom_colors_pca <- custom_pal_hex[c(1, 5, 6, 7, 8)]
custom_colors_pca <- custom_colors
yy <- ggplot(pc_scores, aes(x = PC1, y = PC2, color = seascape)) +
geom_point() +
stat_ellipse(aes(fill = seascape), level = 0.90, geom = "polygon", alpha = 0.3, color = "black") +
scale_color_manual(values = custom_colors_pca) +
scale_fill_manual(values = custom_colors_pca) +
theme_classic() +
xlim(-2,2) +
ylim(-2,2) +
# xlim(-1,1) +
# ylim(-1,1) +
geom_point(size=2) +
guides(colour = guide_legend(override.aes = list(size=2))) +
theme(axis.text.x = element_text(size = 32), # Set X-axis label font size
axis.text.y = element_text(size = 32)) +
theme(axis.title.x = element_text(size = 32),
axis.title.y = element_text(size = 32)) +
theme(legend.text = element_text(size = 32))
yy
Warning: Removed 102 rows containing non-finite values (`stat_ellipse()`).
Too few points to calculate an ellipse
Too few points to calculate an ellipse
Warning: Removed 102 rows containing missing values (`geom_point()`).
Warning: Removed 102 rows containing missing values (`geom_point()`).

################################################################################################
# # Create PCA with eigenvectors
# Extract principal component scores
pc_scores2 <- pca$x
# Extract eigenvectors
eigenvectors <- pca$rotation
# Calculate the percentage variance explained by each principal component
total_variance <- sum(pca$sdev^2)
pc_var_percent <- round(100 * (pca$sdev^2) / total_variance, 1)
# Convert X8.day.seascapes to a factor
filt_df_pca$X8.day.seascapes <- as.factor(filt_df_pca$X8.day.seascapes)
# # For hydrography
qq <- ggplot(filt_df_pca, aes(x = pc_scores2[,1], y = pc_scores2[,2], color = X8.day.seascapes)) +
geom_point(size = 3, alpha = 0.7) +
scale_color_manual(values = custom_colors_pca) +
geom_segment(aes(x = 0, y = 0, xend = eigenvectors[1, 1], yend = eigenvectors[2, 1]),
arrow = arrow(length = unit(0.1, "inches")), color = "black") + # Add vector for PC1
geom_segment(aes(x = 0, y = 0, xend = eigenvectors[1, 2], yend = eigenvectors[2, 2]),
arrow = arrow(length = unit(0.1, "inches")), color = "black") + # Add vector for PC2
geom_segment(aes(x = 0, y = 0, xend = eigenvectors[1, 3], yend = eigenvectors[2, 3]),
arrow = arrow(length = unit(0.1, "inches")), color = "black") + # Add vector for PC3
geom_segment(aes(x = 0, y = 0, xend = eigenvectors[1, 4], yend = eigenvectors[2, 4]),
arrow = arrow(length = unit(0.1, "inches")), color = "black") + # Add vector for PC4
geom_segment(aes(x = 0, y = 0, xend = eigenvectors[1, 5], yend = eigenvectors[2, 5]),
arrow = arrow(length = unit(0.1, "inches")), color = "black") + # Add vector for PC5
geom_segment(aes(x = 0, y = 0, xend = eigenvectors[1, 6], yend = eigenvectors[2, 6]),
arrow = arrow(length = unit(0.1, "inches")), color = "black") + # Add vector for PC6
geom_text(aes(x = eigenvectors[1, 1], y = eigenvectors[2, 1],
label = paste("PC1 (", pc_var_percent[1], "%: Salinity)", sep = "")),
vjust = -0.5, hjust = 0.5, color = "black") + # Add label for PC1
geom_text(aes(x = eigenvectors[1, 2], y = eigenvectors[2, 2],
label = paste("PC2 (", pc_var_percent[2], "%: Chla)", sep = "")),
vjust = -0.5, hjust = 0.5, color = "black") + # Add label for PC2
geom_text(aes(x = eigenvectors[1, 3], y = eigenvectors[2, 3],
label = paste("PC3 (", pc_var_percent[3], "%: Phosphate)", sep = "")),
vjust = -0.5, hjust = 0.5, color = "black") + # Add label for PC3
geom_text(aes(x = eigenvectors[1, 4], y = eigenvectors[2, 4],
label = paste("PC4 (", pc_var_percent[4], "%: NOx)", sep = "")),
vjust = -0.5, hjust = 0.5, color = "black") + # Add label for PC4
geom_text(aes(x = eigenvectors[1, 5], y = eigenvectors[2, 5],
label = paste("PC5 (", pc_var_percent[5], "%: Ammonium)", sep = "")),
vjust = -0.5, hjust = 0.5, color = "black") + # Add label for PC5
geom_text(aes(x = eigenvectors[1, 6], y = eigenvectors[2, 6],
label = paste("PC6 (", pc_var_percent[6], "%: Temperature)", sep = "")),
vjust = -0.5, hjust = 0.5, color = "black") + # Add label for PC6
labs(x = "PC1", y = "PC2", color = "Seascape class") +
xlim(-1.5, 1.5) +
ylim(-1.5, 1.5) +
guides(colour = guide_legend(override.aes = list(size=5))) +
theme_classic() +
theme(axis.text.x = element_text(size = 18), # Set X-axis label font size
axis.text.y = element_text(size = 18)) +
theme(axis.title.x = element_text(size = 18),
axis.title.y = element_text(size = 18)) +
theme(legend.text = element_text(size = 18)) +
theme(legend.title = element_text(size = 18))
qq
Warning: Removed 196 rows containing missing values (`geom_point()`).

# # # For phytoplankton
# qq2 <- ggplot(filt_df_pca, aes(x = pc_scores2[,1], y = pc_scores2[,3], color = X8.day.seascapes)) +
# geom_point(size = 4) +
# scale_color_manual(values = custom_colors_pca) +
# geom_segment(aes(x = 0, y = 0, xend = eigenvectors[1, 1], yend = eigenvectors[2, 1]),
# arrow = arrow(length = unit(0.2, "inches")), color = "black") + # Add vector for PC1
# geom_segment(aes(x = 0, y = 0, xend = eigenvectors[1, 2], yend = eigenvectors[2, 2]),
# arrow = arrow(length = unit(0.2, "inches")), color = "black") + # Add vector for PC2
# geom_segment(aes(x = 0, y = 0, xend = eigenvectors[1, 3], yend = eigenvectors[2, 3]),
# arrow = arrow(length = unit(0.2, "inches")), color = "black") + # Add vector for PC3
# geom_segment(aes(x = 0, y = 0, xend = eigenvectors[1, 4], yend = eigenvectors[2, 4]),
# arrow = arrow(length = unit(0.2, "inches")), color = "black") + # Add vector for PC4
# geom_segment(aes(x = 0, y = 0, xend = eigenvectors[1, 5], yend = eigenvectors[2, 5]),
# arrow = arrow(length = unit(0.2, "inches")), color = "black") + # Add vector for PC5
# geom_segment(aes(x = 0, y = 0, xend = eigenvectors[1, 6], yend = eigenvectors[2, 6]),
# arrow = arrow(length = unit(0.2, "inches")), color = "black") + # Add vector for PC6
# geom_segment(aes(x = 0, y = 0, xend = eigenvectors[1, 7], yend = eigenvectors[2, 7]),
# arrow = arrow(length = unit(0.2, "inches")), color = "black") + # Add vector for PC7
# geom_text(aes(x = eigenvectors[1, 1], y = eigenvectors[2, 1],
# label = paste("PC1 (", pc_var_percent[1], "%: Ceratium)", sep = "")),
# vjust = -0.5, hjust = 0.5, color = "black") + # Add label for PC1
# geom_text(aes(x = eigenvectors[1, 2], y = eigenvectors[2, 2],
# label = paste("PC2 (", pc_var_percent[2], "%: Chaetoceros)", sep = "")),
# vjust = -0.5, hjust = 0.5, color = "black") + # Add label for PC2
# geom_text(aes(x = eigenvectors[1, 3], y = eigenvectors[2, 3],
# label = paste("PC3 (", pc_var_percent[3], "%: Diatoms)", sep = "")),
# vjust = -0.5, hjust = 0.5, color = "black") + # Add label for PC3
# geom_text(aes(x = eigenvectors[1, 4], y = eigenvectors[2, 4],
# label = paste("PC4 (", pc_var_percent[4], "%: Diatoms2)", sep = "")),
# vjust = -0.5, hjust = 0.5, color = "black") + # Add label for PC4
# geom_text(aes(x = eigenvectors[1, 5], y = eigenvectors[2, 5],
# label = paste("PC5 (", pc_var_percent[5], "%: Guinardia)", sep = "")),
# vjust = -0.5, hjust = 0.5, color = "black") + # Add label for PC5
# geom_text(aes(x = eigenvectors[1, 6], y = eigenvectors[2, 6],
# label = paste("PC6 (", pc_var_percent[6], "%: Neocalyptrella)", sep = "")),
# vjust = -0.5, hjust = 0.5, color = "black") + # Add label for PC6
# geom_text(aes(x = eigenvectors[1, 7], y = eigenvectors[2, 7],
# label = paste("PC7 (", pc_var_percent[7], "%: Trichodesmium)", sep = "")),
# vjust = -0.5, hjust = 0.5, color = "black") + # Add label for PC7
# labs(x = "PC1", y = "PC3", color = "X8.day.seascapes") +
# xlim(-1, 1) +
# ylim(-1, 1) +
# guides(colour = guide_legend(override.aes = list(size=2))) +
# theme_classic() +
# theme(axis.text.x = element_text(size = 32), # Set X-axis label font size
# axis.text.y = element_text(size = 32)) +
# theme(axis.title.x = element_text(size = 32),
# axis.title.y = element_text(size = 32)) +
# theme(legend.text = element_text(size = 32))
# qq2
#
#
# # # For zooplankton
# qq3 <- ggplot(filt_df_pca, aes(x = pc_scores2[,1], y = pc_scores2[,2], color = X8.day.seascapes)) +
# geom_point(size = 4) +
# scale_color_manual(values = custom_colors_pca) +
# geom_segment(aes(x = 0, y = 0, xend = eigenvectors[1, 1], yend = eigenvectors[2, 1]),
# arrow = arrow(length = unit(0.2, "inches")), color = "black") + # Add vector for PC1
# geom_segment(aes(x = 0, y = 0, xend = eigenvectors[1, 2], yend = eigenvectors[2, 2]),
# arrow = arrow(length = unit(0.2, "inches")), color = "black") + # Add vector for PC2
# geom_segment(aes(x = 0, y = 0, xend = eigenvectors[1, 3], yend = eigenvectors[2, 3]),
# arrow = arrow(length = unit(0.2, "inches")), color = "black") + # Add vector for PC3
# geom_segment(aes(x = 0, y = 0, xend = eigenvectors[1, 4], yend = eigenvectors[2, 4]),
# arrow = arrow(length = unit(0.2, "inches")), color = "black") + # Add vector for PC4
# geom_segment(aes(x = 0, y = 0, xend = eigenvectors[1, 5], yend = eigenvectors[2, 5]),
# arrow = arrow(length = unit(0.2, "inches")), color = "black") + # Add vector for PC5
# geom_segment(aes(x = 0, y = 0, xend = eigenvectors[1, 6], yend = eigenvectors[2, 6]),
# arrow = arrow(length = unit(0.2, "inches")), color = "black") + # Add vector for PC6
# geom_segment(aes(x = 0, y = 0, xend = eigenvectors[1, 7], yend = eigenvectors[2, 7]),
# arrow = arrow(length = unit(0.2, "inches")), color = "black") + # Add vector for PC7
# geom_segment(aes(x = 0, y = 0, xend = eigenvectors[1, 8], yend = eigenvectors[2, 8]),
# arrow = arrow(length = unit(0.2, "inches")), color = "black") + # Add vector for PC8
# geom_text(aes(x = eigenvectors[1, 1], y = eigenvectors[2, 1],
# label = paste("PC1 (", pc_var_percent[1], "%: Acantharea)", sep = "")),
# vjust = -0.5, hjust = 0.5, color = "black") + # Add label for PC1
# geom_text(aes(x = eigenvectors[1, 2], y = eigenvectors[2, 2],
# label = paste("PC2 (", pc_var_percent[2], "%: Copepods)", sep = "")),
# vjust = -0.5, hjust = 0.5, color = "black") + # Add label for PC2
# geom_text(aes(x = eigenvectors[1, 3], y = eigenvectors[2, 3],
# label = paste("PC3 (", pc_var_percent[3], "%: Echinoderms)", sep = "")),
# vjust = -0.5, hjust = 0.5, color = "black") + # Add label for PC3
# geom_text(aes(x = eigenvectors[1, 4], y = eigenvectors[2, 4],
# label = paste("PC4 (", pc_var_percent[4], "%: Jellies)", sep = "")),
# vjust = -0.5, hjust = 0.5, color = "black") + # Add label for PC4
# geom_text(aes(x = eigenvectors[1, 5], y = eigenvectors[2, 5],
# label = paste("PC5 (", pc_var_percent[5], "%: Larvaceans)", sep = "")),
# vjust = -0.5, hjust = 0.5, color = "black") + # Add label for PC5
# geom_text(aes(x = eigenvectors[1, 6], y = eigenvectors[2, 6],
# label = paste("PC6 (", pc_var_percent[6], "%: Polychaetes)", sep = "")),
# vjust = -0.5, hjust = 0.5, color = "black") + # Add label for PC6
# geom_text(aes(x = eigenvectors[1, 7], y = eigenvectors[2, 7],
# label = paste("PC7 (", pc_var_percent[7], "%: Chaetognaths)", sep = "")),
# vjust = -0.5, hjust = 0.5, color = "black") + # Add label for PC7
# geom_text(aes(x = eigenvectors[1, 8], y = eigenvectors[2, 8],
# label = paste("PC8 (", pc_var_percent[8], "%: Pteropods)", sep = "")),
# vjust = -0.5, hjust = 0.5, color = "black") + # Add label for PC8
# labs(x = "PC1", y = "PC2", color = "X8.day.seascapes") +
# xlim(-1, 1) +
# ylim(-1, 1) +
# guides(colour = guide_legend(override.aes = list(size=2))) +
# theme_classic() +
# theme(axis.text.x = element_text(size = 32), # Set X-axis label font size
# axis.text.y = element_text(size = 32)) +
# theme(axis.title.x = element_text(size = 32),
# axis.title.y = element_text(size = 32)) +
# theme(legend.text = element_text(size = 32))
# qq3
Map overall WEIGHTED mean concentration values of selected taxa by
transect
library(ggOceanMaps)
library(leaflet)
library(patchwork)
# For zooplankton
taxa_meta_concentration <- taxa_meta %>%
mutate(across(c(Acantharea,Chaetognaths,Ostracods,Copepods,Decapods,Echinoderms,Jellies,
Larvaceans,Polychaets,Pteropods), ~ ./total_vol_sampled * 1e6)) %>%
select(Station, dec_lat, dec_lon, year, month, date, Avg.chl.a..ug.L.,
temp..degC., salinity, total_vol_sampled,
Acantharea,Chaetognaths,Ostracods,Copepods,Decapods,Echinoderms,
Jellies,Larvaceans,Polychaets,Pteropods) %>%
filter(!is.na(total_vol_sampled))
# # For phytoplankton
# taxa_meta_concentration <- taxa_meta %>%
# mutate(across(c(Centric,Ceratium,Chaetoceros,Chain2,Chain3,Guinardia,Neocalyptrella,
# Noctiluca,Tricho), ~ ./total_vol_sampled * 1e6)) %>%
# select(Station, dec_lat, dec_lon, year, month, date, Avg.chl.a..ug.L.,
# temp..degC., salinity,total_vol_sampled,
# Centric,Ceratium,Chaetoceros,Chain2,Chain3,Guinardia,Neocalyptrella,
# Noctiluca,Tricho) %>%
# filter(!is.na(total_vol_sampled))
# List of class names to loop through
class_names <- c("Acantharea", "Chaetognaths", "Ostracods", "Copepods", "Decapods", "Echinoderms", "Jellies", "Larvaceans", "Polychaets", "Pteropods")
class_names <- "Acantharea"
path_sfer_list <- "~/Library/CloudStorage/GoogleDrive-enriquemontes01@gmail.com/My Drive/GDrive/proposals/2022_02_MultiStressor_NOAA/module_1"
sfer_curated <- list.files(path_sfer_list, pattern = "sfer_stations_curated.csv", full.names = TRUE)
sfer_sta_list <- read.csv(sfer_curated, header = TRUE)
# Loop over each class name to generate maps
# for (class_name in class_names) {
# # Filter concentration data for the current class and merges curated station list with the concentration df
concentration_df <- taxa_meta_concentration %>%
left_join(sfer_sta_list %>% filter(station_class %in% "C"), by = c('Station' = 'station_id'))
# Compute weighted average lat/lons and mean variable values for the current class
taxa_concentration_avg <- concentration_df %>%
group_by(line_id) %>%
mutate(weight = get(class_name) / sum(get(class_name), na.rm = TRUE)) %>%
summarise(longitude = weighted.mean(dec_lon, w = weight, na.rm = TRUE),
latitude = weighted.mean(dec_lat, w = weight, na.rm = TRUE),
sel_taxa_mean = mean(get(class_name), na.rm = TRUE),
sel_taxa_sd = sd(get(class_name), na.rm = TRUE))
# Prepare data for mapping
dt <- data.frame(
lon = taxa_concentration_avg$longitude,
lat = taxa_concentration_avg$latitude,
mean_param = taxa_concentration_avg$sel_taxa_mean,
sd_param = taxa_concentration_avg$sel_taxa_sd)
# Add station markers
station_markers <- sfer_sta_list %>% filter(station_class %in% "C")
# Create the map
concentration_map <- basemap(limits = c(-86, -79.5, 24, 28.5), bathymetry = TRUE) +
geom_point(data = dt, aes(x = lon, y = lat, size = mean_param, color = sd_param)) +
scale_color_gradient(low = "yellow", high = "red", na.value = NA, name = "Standard Deviation") +
geom_point(data = station_markers, aes(x = mean_lon, y = mean_lat), color = "black", shape = 3, size = 1.5) +
scale_size_continuous(name = "Average concentration",
breaks = c(250, 500, 1000),
range = c(0, 10),
guide = guide_legend(override.aes = list(color = "black", fill = "white"))) +
theme_minimal() +
ggtitle(paste(class_name))
# Optionally, save each map as an image
ggsave(paste0("concentration_map_", class_name, ".png"), plot = concentration_map, width = 8, height = 6)
Warning: Removed 8 rows containing missing values (`geom_point()`).
# }
Map overall mean concentration values of selected taxa per
station

Map mean count values of selected taxa and specified dates
# # ggOceanMaps:
# https://biostats-r.github.io/biostats/workingInR/140_maps.html
# https://github.com/MikkoVihtakari/ggOceanMaps: Use this one: remotes::install_github("MikkoVihtakari/ggOceanMaps")
# install.packages("ggOceanMaps") # # This is outdated, don't use!!!
library(ggOceanMaps)
library(leaflet)
# For NOAA machines
# path_sfer_list <- "/Users/enrique.montes/Library/CloudStorage/GoogleDrive-enriquemontes01@gmail.com/My Drive/GDrive/proposals/2022_02_MultiStressor_NOAA/module_1"
# For personal machine
path_sfer_list <- "~/Library/CloudStorage/GoogleDrive-enriquemontes01@gmail.com/My Drive/GDrive/proposals/2022_02_MultiStressor_NOAA/module_1"
sfer_curated <- list.files(path_sfer_list, pattern = "sfer_stations_curated.csv", full.names = TRUE)
sfer_sta_list <- read.csv(sfer_curated, header = TRUE)
merged_data <- taxa_meta %>%
left_join(sfer_sta_list %>% filter(station_class %in% "C"), by = c('Station' = 'station_id'))
# Replace 2023 and 10 with your selected year and month
selected_year <- 2023
selected_month <- 11
# Filter rows
filtered_taxa_meta <- merged_data %>%
filter(year == selected_year, month == selected_month)
# filtered_taxa_meta <- merged_data
# Compute weighted average lat lons based on selected variable, and mean variable values (e.g., Copepods)
taxa_avg <- filtered_taxa_meta %>%
group_by(line_id) %>%
mutate(weight = Guinardia / sum(Guinardia, na.rm = TRUE)) %>%
summarise(longitude = weighted.mean(dec_lon, w = weight, na.rm = TRUE),
latitude = weighted.mean(dec_lat, w = weight, na.rm = TRUE),
sel_taxa_mean = mean(Guinardia, na.rm = TRUE))
# # Filter out values less than 1
filtered_taxa_avg <- taxa_avg %>%
filter(sel_taxa_mean > 0)
# Find the minimum and maximum values of sel_taxa_mean
min_value <- min(filtered_taxa_avg$sel_taxa_mean, na.rm = TRUE)
max_value <- max(filtered_taxa_avg$sel_taxa_mean, na.rm = TRUE)
# # Adjust the color palette with specified values
# color_palette <- colorNumeric(palette = "Oranges", domain = c(min_value, max_value))
# # Create the map
# map <- leaflet(filtered_taxa_avg) %>%
# addProviderTiles("Esri.WorldImagery") %>%
# addCircleMarkers(
# lng = ~longitude,
# lat = ~latitude,
# radius = 10,
# color = ~color_palette(sel_taxa_mean),
# fillOpacity = 0.8,
# popup = ~paste("Line ID: ", line_id, "<br>Copepods: ", sel_taxa_mean)
# ) %>%
# addLegend(
# position = "bottomright",
# pal = color_palette,
# values = c(min_value, max_value), # Adjusted values here
# title = "Copepod occurrences",
# opacity = 1
# )
# map
# # Map with bathymetry
# Overlay color-scaled dots
dt <- data.frame(
lon = filtered_taxa_avg$longitude,
lat = filtered_taxa_avg$latitude,
sel_param = filtered_taxa_avg$sel_taxa_mean)
# # Colors:
# Acantharea = darkturquoise; breaks = c(0.25, 1, 2.5, 5); limits = c(0.1, 10)
# Copepods = red; breaks = c(0.25, 1, 2.5, 5, 10); limits = c(0.1, 15)
# Chain diatoms = orange; breaks = c(0.5, 1, 10, 30, 60); c(0.1, 70)
# Chaetoceros = purple; breaks = c(0.25, 1, 2.5, 5, 10); limits = c(0.1, 15)
# Echinoderms = magenta; breaks = c(0.5, 1, 3, 5); limits = c(0.1, 5)
# Larvaceans = plum; breaks = c(0.5, 1, 3, 5); limits = c(0.1, 5)
# Polychaetes = dodgerblue; breaks = c(0.25, 0.5, 1, 3); limits = c(0.1, 3)
# Jellies = slategray4; breaks = c(0.25, 0.5, 1, 2); limits = c(0.1, 3)
occ_map <- basemap(limits = c(-84, -79.5, 24, 28.5), bathymetry = TRUE) +
geom_point(data = dt, aes(x = lon, y = lat, size = sel_param), fill = "olivedrab2", color = "olivedrab2") +
geom_point(data = filtered_taxa_meta, aes(x = dec_lon, y = dec_lat), color = "black", shape = 3, size = 1.5) +
scale_size_continuous(name = "Average counts",
breaks = c(0.5, 1, 3, 5, 10),
limits = c(0.1, 40),
range = c(1, 15)) +
theme_minimal()
occ_map
# occ_map_leg <- ggplot(dt, aes(lon, lat)) +
# geom_point(aes(size = sel_param), fill = "green", color ="green") +
# theme_minimal()
# occ_map_leg
Create time-series plots of selected taxa at specific stations
selected_variable <- 'Chaetoceros'
# # Lower Keys
# selected_line_id <- c('LK','WS','MO','KW')
# # Middle Keys
# selected_line_id <- c('CO', 'CR')
# # Other regions
selected_line_id <- c('CAL')
# Filter the data for the selected line_id
filtered_merged_data <- merged_data[merged_data$line_id %in% selected_line_id, ]
# Remove rows with NAs in relevant columns
monthly_means <- na.omit(filtered_merged_data[, c("date", selected_variable, "line_id")]) %>%
group_by(year_month = format(date, "%Y-%m")) %>%
summarise(mean_value = mean(!!sym(selected_variable), na.rm = TRUE),
se_value = sd(!!sym(selected_variable), na.rm = TRUE) / sqrt(n()))
# Create a time series plot
ggplot(monthly_means, aes(x = year_month, y = mean_value)) +
geom_bar(stat = "identity", fill = "orange") +
geom_errorbar(aes(ymin = mean_value - se_value, ymax = mean_value + se_value),
width = 0.2, # Adjust width as needed
position = position_dodge(width = 0.8)) + # Adjust width as needed
labs(x = "Year-Month", y = "Mean Value") +
ggtitle("Monthly Means of Selected Variable") +
theme_minimal()
Match file name with string ID - DO NOT USE
# library(tidyverse)
# library(lubridate)
# library(dplyr)
#
# # extract relevant part of the strings
# df <- rbind(class.Copepods,
# class.Eucampia,
# class.Noctiluca,
# class.Polychaets,
# class.Acantharea,
# class.Centric,
# class.Ceratium,
# class.Chaetoceros,
# class.Chain2,
# class.Chain3,
# class.Chain4,
# class.Ostracods,
# class.Jellies,
# class.Larvaceans,
# class.pellets)
# sub_strings <- substr(df$V1, start = 10, stop = 22)
# unique_all <- unique(sub_strings)
#
# # select unique dates (this allows to search CTD records per date and time)
# # To find unique dates and times to extract CDT data use: unique_all[grepl("20221209", unique_all)]
#
# id_list <- unique_all
# id_list2 <- as.POSIXct(id_list, format="%Y%m%d_%H%M", tz="UTC")
#
# conc_occ_count <- data.frame(date = as.Date(character()), stringsAsFactors = FALSE)
#
# for ( i in seq_along(id_list)){
# acantha <- as.data.frame(str_count(class.Acantharea$V1, id_list[i]))
# centric <- as.data.frame(str_count(class.Centric$V1, id_list[i]))
# ceratium <- as.data.frame(str_count(class.Ceratium$V1, id_list[i]))
# chaetoceros <- as.data.frame(str_count(class.Chaetoceros$V1, id_list[i]))
# chaetog <- as.data.frame(str_count(class.Chaetognaths$V1, id_list[i]))
# chain2 <- as.data.frame(str_count(class.Chain2$V1, id_list[i]))
# chain3 <- as.data.frame(str_count(class.Chain3$V1, id_list[i]))
# chain4 <- as.data.frame(str_count(class.Chain4$V1, id_list[i]))
# ostra <- as.data.frame(str_count(class.Ostracods$V1, id_list[i]))
# copepods <- as.data.frame(str_count(class.Copepods$V1, id_list[i]))
# decapod <- as.data.frame(str_count(class.Decapods$V1, id_list[i]))
# echino <- as.data.frame(str_count(class.Echinoderms$V1, id_list[i]))
# eucampia <- as.data.frame(str_count(class.Eucampia$V1, id_list[i]))
# jellies <- as.data.frame(str_count(class.Jellies$V1, id_list[i]))
# larvae <- as.data.frame(str_count(class.Larvaceans$V1, id_list[i]))
# nocti <- as.data.frame(str_count(class.Noctiluca$V1, id_list[i]))
# polychaetes <- as.data.frame(str_count(class.Polychaets$V1, id_list[i]))
# tricho <- as.data.frame(str_count(class.Tricho$V1, id_list[i]))
#
# Acantharea <- colSums(acantha != 0)
# Centric <- colSums(centric != 0)
# Ceratium_spp <- colSums(ceratium != 0)
# Chaetoceros <- colSums(chaetoceros != 0)
# Chaetognaths <- colSums(chaetog != 0)
# Diatom_chains_1 <- colSums(chain2 != 0)
# Diatom_chains_2 <- colSums(chain3 != 0)
# Diatom_chains_3 <- colSums(chain4 != 0)
# Ostracods <- colSums(ostra != 0)
# Copepods <- colSums(copepods != 0)
# Decapods <- colSums(decapod != 0)
# Echinoderms <- colSums(echino != 0)
# Eucampia_spp <- colSums(eucampia != 0)
# Jellies<- colSums(jellies != 0)
# Larvaceans <- colSums(larvae != 0)
# Noctiluca <- colSums(nocti != 0)
# Polychaetes <- colSums(polychaetes != 0)
# Trichodesmium_spp <- colSums(tricho != 0)
#
# # Parse the date-time string with ymd_hm()
# occ_datetime <- as.POSIXct(id_list[i], format="%Y%m%d_%H%M", tz="UTC")
# occ_datetime_str <- substr(id_list[i], 1, 13)
#
# row_df <- data.frame(date = occ_datetime, occ_datetime_str,
# Acantharea,
# Centric,
# Ceratium_spp,
# Chaetoceros,
# Chaetognaths,
# Diatom_chains_1,
# Diatom_chains_2,
# Diatom_chains_3,
# Ostracods,
# Copepods,
# Decapods,
# Echinoderms,
# Eucampia_spp,
# Jellies,
# Larvaceans,
# Noctiluca,
# Polychaetes,
# Trichodesmium_spp
# )
# rownames(row_df) <- i
#
# conc_occ_count <- rbind(conc_occ_count, row_df)
# }
#
# conc_occ_final <- arrange(conc_occ_count, date)
---
title: "R Notebook: Stringmatch"
output: html_notebook
---


# Load data files
```{r}
library(dplyr)

# specify the directory where the files are located
# For NOAA machines
# dir_path <- "/Users/enrique.montes/Google Drive/My Drive/GDrive/OCED_AOML/WS_cruises/plankton_imaging/CPICS/TS.Master_selection"
# For personal machine
dir_path <- "~/Library/CloudStorage/GoogleDrive-enriquemontes01@gmail.com/My Drive/GDrive/OCED_AOML/WS_cruises/plankton_imaging/CPICS/TS.Master_selection"

# obtain a list of file names in the directory
file_names <- list.files(path = dir_path, pattern = ".txt", full.names = TRUE)

# loop over each file and import the tables (use this for DATES)
for (file in file_names) {
  table_name <- gsub(".txt", "", basename(file)) # get the name of the table from the file name
  assign(table_name, read.table(file = file, header = FALSE, sep = "\t") %>%
           mutate(date = as.POSIXct(substr(V1, start = 24, stop = 36), format="%Y%m%d_%H%M", tz="UTC")))
}

# # loop over each file and import the tables (use this for STRINGS)
# for (file in file_names) {
#   table_name <- gsub(".txt", "", basename(file)) # get the name of the table from the file name
#   assign(table_name, read.table(file = file, header = FALSE, sep = "\t"))
# }

```


# Match file name using DATE values
```{r}
library(tidyverse)
library(lubridate)

dir_path2 <- "~/Library/CloudStorage/GoogleDrive-enriquemontes01@gmail.com/My Drive/GDrive/OCED_AOML/WS_cruises/plankton_imaging/CPICS/ws_cruise_ctd"

file_name <- list.files(path = dir_path2, pattern = "ctd_meta_v3.csv", full.names = TRUE)
ctd_meta <- read.csv(file_name, fill = TRUE)

# USE WITH ctd_meta_v3.csv
dt_list <- as.POSIXct(paste(ctd_meta$year,
                            sprintf("%02d", ctd_meta$month),
                            sprintf("%02d", ctd_meta$day),
                            ctd_meta$time_gmt),
                      format = "%Y%m%d %I:%M:%S %p",
                      tz = "UTC")

################################################################################################################
# This section detects short transit times between stations

# Calculate time differences in seconds between consecutive dt_list objects
time_differences <- as.numeric(difftime(dt_list[-1], dt_list[-length(dt_list)], units = "secs"))

# Convert time differences from seconds to minutes
time_differences_mins <- time_differences / 60

# Create a data frame showing the original times and their differences in minutes
time_diff_df <- data.frame(
  start_time = dt_list[-length(dt_list)],
  end_time = dt_list[-1],
  time_difference_mins = time_differences_mins
)

# find CTD time stamps of consecutive stations within less than 20 min. This will identify CTD casts that are close to each other
short_t_idx <- which(time_diff_df$time_difference_mins < 20)
short_timestamps <- dt_list[short_t_idx]
#################################################################################################################

# Create empty data frame to store results
conc_occ_final <- data.frame(date = character(), count = numeric())

# List of class objects to be processed
class_names <- c("Acantharea", "Centric", "Ceratium", "Chaetoceros", "Chaetognaths", 
                 "Chain2", "Chain3", "Ostracods", "Copepods", "Decapods", "Echinoderms", 
                 "Guinardia", "Jellies", "Larvaceans", "Neocalyptrella", "Noctiluca", 
                 "Polychaets", "Pteropods", "Tricho")

# time buffer before and after CTD time in seconds so that CPICS records are matched to CTD times.
start <- 10 * 60 
stop <- 10 * 60 

# Iterate over dt_list intervals
for (i in 1:length(dt_list)) {
  
  # Initialize a list to store counts for the current interval
  counts_list <- list(date = dt_list[i])
  
  # Iterate over each class object and perform subsetting
  for (class_name in class_names) {
    class_data <- get(paste0("class.", class_name))  # Dynamically get the class data frame
    
    if (i < length(dt_list)) {
      # Subsetting for all intervals except the last one
      subset_data <- subset(class_data, date >= dt_list[i]-start & date < dt_list[i+1]-stop)
    } else {
      # Subsetting for the last interval: capture all data greater than or equal to the last dt_list
      subset_data <- subset(class_data, date >= dt_list[i]-start)
    }
    
    counts_list[[class_name]] <- nrow(subset_data)
  }
  
  # Convert counts_list to a data frame and bind it to the result
  result <- as.data.frame(counts_list)
  conc_occ_final <- rbind(conc_occ_final, result)
} 

# Combine with ctd_meta
taxa_meta <- cbind(ctd_meta, conc_occ_final)

################################################################################
# Check for unaccounted CPICS records 

# Initialize a list to store unaccounted dates for each class
unaccounted_dates_list <- list()

# Iterate over each class object
for (class_name in class_names) {
  # Get the date-time objects from the current class
  sel_class_dates <- get(paste0("class.", class_name))$date
  
  # Initialize a logical vector to track whether each class date is accounted for
  is_accounted_for <- rep(FALSE, length(sel_class_dates))
  
  # Check each class date against the intervals in dt_list
  for (i in 1:length(dt_list)) {
    if (i < length(dt_list)) {
      # Check all intervals except the last one
      interval_start <- dt_list[i] - start
      interval_end <- dt_list[i + 1] - stop
    } else {
      # Last interval captures all data greater than or equal to the last dt_list
      interval_start <- dt_list[i] - start
      interval_end <- Inf  # Effectively no upper bound
    }
    
    # Mark class dates that fall within the current interval as accounted for
    is_accounted_for <- is_accounted_for | (sel_class_dates >= interval_start & sel_class_dates < interval_end)
  }
  
  # Subset the class dates that were not accounted for
  unaccounted_sel_class_dates <- sel_class_dates[!is_accounted_for]
  
  # Store the unaccounted dates in the list
  unaccounted_dates_list[[class_name]] <- unaccounted_sel_class_dates
}

# Print or view the unaccounted dates for each class
for (class_name in class_names) {
  cat("Unaccounted dates for class:", class_name, "\n")
  print(unaccounted_dates_list[[class_name]])
  cat("\n")
}
```

# # Calculate plankton concentration time series per station
```{r}
# Calculate species concentration (counts/ml) for each species

# # For zooplankton
# taxa_meta_concentration <- taxa_meta %>%
#   mutate(across(c(Acantharea,Chaetognaths,Ostracods,Copepods,Decapods,Echinoderms,Jellies,
#     Larvaceans,Polychaets,Pteropods), ~ ./total_vol_sampled * 1e6)) %>%
#   select(Station, dec_lat, dec_lon, year, month, date, Avg.chl.a..ug.L.,
#          temp..degC., salinity, total_vol_sampled,
#          Acantharea,Chaetognaths,Ostracods,Copepods,Decapods,Echinoderms,
#          Jellies,Larvaceans,Polychaets,Pteropods) %>%
#   filter(!is.na(total_vol_sampled))
# 
# # # Transform taxa_meta_concentration to long format
# taxa_meta_long <- taxa_meta_concentration %>%
#   pivot_longer(cols = c(Acantharea, Chaetognaths,Ostracods,Copepods,Decapods,
#                         Echinoderms, Jellies, Larvaceans,Polychaets, Pteropods),
#                names_to = "species", values_to = "species_concentration")

# For phytoplankton
taxa_meta_concentration <- taxa_meta %>%
  mutate(across(c(Centric,Ceratium,Chaetoceros,Chain2,Chain3,Guinardia,Neocalyptrella,
                  Noctiluca,Tricho), ~ ./total_vol_sampled * 1e6)) %>%
  select(Station, dec_lat, dec_lon, year, month, date, Avg.chl.a..ug.L.,
         temp..degC., salinity,total_vol_sampled,
         Centric,Ceratium,Chaetoceros,Chain2,Chain3,Guinardia,Neocalyptrella,
                  Noctiluca,Tricho) %>%
  filter(!is.na(total_vol_sampled))

# # Transform taxa_meta_concentration to long format
taxa_meta_long <- taxa_meta_concentration %>%
  pivot_longer(cols = c(Centric,Ceratium,Chaetoceros,Chain2,Chain3,Guinardia,Neocalyptrella,
                  Noctiluca,Tricho),
               names_to = "species", values_to = "species_concentration")

# Filter the data for Stations
station_list_fk <- c("WS","21/LK","MR","16","18","10","12","9.5","9","7","2")
# station_list_sr <- c("68","65","64","60","58","57.3","57.2","57.1","57","56","55","54","53","51",
#                   "49","47","45","41","30","31","33")
# station_list_cal <- c("CAL5","CAL4","CAL3","CAL2","CAL1","RP1","RP2","RP3","RP4","GP5","BG4","BG3",
#                   "BG2","BG1","BG6", "BG7")
# station_list_vl <- c("V1","V2","V3","V4","V5","V6","V7","V8","V9","L1","L3","L5","L7","L9")
# station_list_tb <- c("AMI9","AMI8","AMI7","AMI6","AMI5","AMI4","AMI3","AMI2","AMI1","TB1","TB2",
#                   "TB3","TB4","TB5","TB10","CW4","CW3","CW2","CW1")
filtered_taxa_meta_long <- taxa_meta_long %>%
  filter(Station %in% station_list_fk)

# Without filtering per group of stations but looking at the entire region
# filtered_taxa_meta_long <- taxa_meta_long

# Calculate mean and standard deviation of species concentration for each date
summary_data <- filtered_taxa_meta_long %>%
  group_by(year, month, species) %>%
  summarise(mean_concentration = mean(species_concentration),
            sd_concentration = sd(species_concentration))

# Combine year and month columns into a single date column
summary_data$date <- as.Date(paste(summary_data$year, summary_data$month, "15", sep = "-"))

# custom_pal_hex2 <- c('#e41a1c','#377eb8','#4daf4a','#984ea3','#ff7f00','#ffff33','#a65628','#f781bf')
custom_pal_hex2 <- c('darkturquoise','red','#4daf4a','slategray4','orange','dodgerblue','purple', "yellow","pink","violet")

# Acantharea = darkturquoise
# Copepods = red
# echinos = #4daf4a
# Jellies = slategray4
# Larvaceans = orange
# Polychaetes = dodgerblue
# Pteropods = purple

# # Check dominance of groups by calculating the overall mean 
# test <- summary_data %>% filter(species == "Polychaetes") %>% summarize(avg = mean(mean_concentration))
# mean(test$avg)

# Plot time series of mean+sd planton concentration per group of sites
zz <- ggplot(summary_data, aes(x = date, y = mean_concentration, fill = species)) +
  geom_bar(stat = "identity", alpha = 0.7) +
  scale_fill_manual(values = custom_pal_hex2) +
  # geom_errorbar(aes(ymin = 0, ymax = mean_concentration + sd_concentration),
  #               position = position_dodge(width = 0.9), width = 0.2) +
  labs(x = "Date", y = "Mean Species Concentration", fill = "Species") +
  theme_minimal()
zz

# Filter summary_data for selected only
# Acantharea, Copepods, Echinoderms, Jellies, Larvaceans, Chaetognaths, Polychaetes, Pteropods
# Centric,Ceratium,Chaetoceros,Diatoms,Diatoms2,Guinardia,Neocalyptrella,Noctiluca,Trichodesmium
summary_selected <- filtered_taxa_meta_long %>%
  filter(species == "Tricho") %>%
  mutate(date = as.Date(paste(year, month, "15", sep = "-")))

# Create the time series boxplot
  sel_box <-  ggplot(summary_selected, aes(x = as.factor(date), y = species_concentration)) +
    geom_boxplot(fill = "white", notch = FALSE, outlier.shape = NA) +
    geom_violin(fill = "lightgrey", color = "NA", alpha = 0.7) +
    # geom_jitter(aes(color = factor(Station), size = temp..degC.), alpha = 0.9) +
    geom_jitter(width = 0.25, aes(color = salinity, size = temp..degC.), alpha = 0.6) +
    # geom_jitter(width = 0.25, aes(color = salinity), size = 3, alpha = 0.6) +
    labs(x = "Date", y = expression("Density (org. m"^"-3"~")")) +
    scale_color_viridis_c(name = "Salinity", option = "mako") +
    scale_size_continuous(name = "Temperature (°C)",
                          limits = c(15, 35),
                          breaks = c(15, 20, 25, 30, 35),
                          range = c(1, 4)) +
    ylim(-5, 30000) +
    theme_minimal() +
    # theme_ipsum() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) 
  sel_box

# # Plot concentration time series for all selected sites
# kk <- ggplot(summary_selected, aes(x = date, y = species_concentration, shape = factor(Station))) +
#   labs(x = "Date", y = "Species concentration (org.m-3)") +
#   geom_point(aes(size = temp..degC.), fill = "darkgrey") + # or colour = factor(Station)
#   scale_shape_manual(values = c(0, 1, 2, 3, 4, 5, 6))
# kk

# kk <- ggplot(summary_selected, aes(x = date, y = species_concentration, colour = factor(Station))) +
#   labs(x = "Date", y = "Species concentration (org.m-3)") +
#   geom_point(aes(size = temp..degC.), alpha = 0.7) + # or colour = factor(Station) +
#   # geom_smooth(method = "lm", formula = y ~ poly(x, 4), se = FALSE) + # Add polynomial fit line
#   scale_colour_brewer(palette = "Set1")
# kk
  
# # Find the index of the last row with date == '2023-01-15' and add a dummy row for missing dates
# new_row <- tibble(Station = NA, dec_lat = NA, dec_lon = NA, year = NA,
#                   month = NA, date = as.Date('2023-03-15'), Avg.chl.a..ug.L. = NA,
#                   temp..degC. = NA, salinity = NA, species = NA, species_concentration = NA)
# last_index <- max(which(summary_selected$date == as.Date('2023-01-15')))
# summary_selected <- bind_rows(
#     summary_selected[1:last_index, ],
#     new_row,
#     summary_selected[(last_index + 1):nrow(summary_selected), ]
#   )
  
```

# # Calculates total plankton concentrations aggregating data from selected stations (eg FK)
```{r}
# Calculate plankton concentration for all species summed up
# For Phyto
taxa_meta_concentration_all <- taxa_meta %>%
  mutate(total_concentration = (
    Centric + Ceratium + Chaetoceros + Chain2 + Chain3 + Guinardia + Neocalyptrella +
    Noctiluca + Tricho) / total_vol_sampled * 1e6) %>%
  select(Station, dec_lat, dec_lon, year, month, date, Avg.chl.a..ug.L.,
         temp..degC., salinity, total_vol_sampled, total_concentration) %>%
  filter(!is.na(total_vol_sampled))

# # For Zooplankton
# taxa_meta_concentration_all <- taxa_meta %>%
#   mutate(total_concentration = (
#     Acantharea + Chaetognaths + Ostracods + Copepods + Decapods + Echinoderms + Jellies +
#     Larvaceans + Polychaets + Pteropods) / total_vol_sampled * 1e6) %>%
#   select(Station, dec_lat, dec_lon, year, month, date, Avg.chl.a..ug.L.,
#          temp..degC., salinity, total_vol_sampled, total_concentration) %>%
#   filter(!is.na(total_vol_sampled))

# Filter the data for Stations
station_list_fk <- c("WS","21/LK","MR","16","18","10","12","9.5","9","7","2")
filtered_taxa_conc_all <- taxa_meta_concentration_all %>%
  filter(Station %in% station_list_fk)

# Aggregate total concentration and total volume sampled per year and month
aggregated_concentration <- filtered_taxa_conc_all %>%
  group_by(year, month) %>%
  summarise(
    total_counts = sum(total_concentration * total_vol_sampled / 1e6, na.rm = TRUE), # Sum up all counts
    total_vol_sampled = sum(total_vol_sampled, na.rm = TRUE), # Sum up all volume sampled
    mean_temp_degC = mean(temp..degC., na.rm = TRUE), # Calculate mean temperature
    mean_salinity = mean(salinity, na.rm = TRUE), # Calculate mean salinity
    mean_chla = mean(Avg.chl.a..ug.L., na.rm = TRUE) 
  ) %>%
  ungroup() %>%
  mutate(total_concentration = (total_counts / total_vol_sampled) * 1e6) # Recalculate the concentration

# Create a Date column for plotting purposes
aggregated_concentration <- aggregated_concentration %>%
  mutate(year_month = as.Date(paste(year, month, "01", sep = "-")))

# Create the time series plot
concentration_ts <- ggplot(aggregated_concentration, aes(x = year_month)) +
  geom_line(aes(y = total_concentration), color = "blue", size = 1) + # Line plot for total concentration
  geom_point(aes(y = total_concentration), color = "red", size = 2) +  # Points for total concentration
  # geom_line(aes(y = (mean_temp_degC - 20) / (35 - 20) * max(total_concentration)), 
  #           color = "orange", size = 1) +  # Line plot for temperature (normalized)
  labs(title = "Total Phytoplankton Concentration Time Series",
       x = "Date",
       y = expression("Total Concentration (org/m"^3~")")) +
  scale_x_date(date_breaks = "1 month", date_labels = "%b %Y") +  # Show ticks for every month
  scale_y_continuous(
    name = expression("Total Concentration (org/m"^3~")"),            # Primary y-axis label
    # sec.axis = sec_axis(~ . * (35 - 20) / max(aggregated_concentration$total_concentration) + 20, 
    #                     name = "Temperature (°C)", 
    #                     breaks = seq(20, 35, by = 5))  # Secondary y-axis label
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    # axis.title.y.right = element_text(color = "orange"),  # Color the secondary y-axis label
    # axis.line.y.right = element_line(color = "orange"),    # Color the secondary y-axis line   
    # panel.grid.major = element_blank(),  # Remove major grid lines
    panel.grid.minor = element_blank(),  # Remove minor grid lines
    # panel.border = element_blank()       # Remove panel border if necessary
  )

# Print the plot
concentration_ts
```

# # Calculate plankton concentrations per seascape class
```{r}
spp_df <- taxa_meta[ , c("X8.day.seascapes", "total_vol_sampled", "date",
                         "Acantharea",
                         "Copepods",
                         "Echinoderms",
                         "Jellies",
                         "Larvaceans",
                         "Polychaets",
                         "Chaetognaths",
                         "Pteropods")]

# spp_df <- taxa_meta[ , c("X8.day.seascapes", "total_vol_sampled", "date",
#                          "Ceratium",
#                          "Chaetoceros",
#                          "Chain2",
#                          "Chain3",
#                          "Guinardia",
#                          "Neocalyptrella",
#                          "Tricho")]

# Remove rows with NaN values in X8.day.seascapes and total_vol_sampled
spp_df <- subset(spp_df, !is.na(X8.day.seascapes) & !is.na(total_vol_sampled))

# Calculate total counts per species within each X8.day.seascapes category
spp_count_per_seascape <- spp_df %>%
  gather(key = "species", value = "count", -(X8.day.seascapes:date)) %>%
  group_by(X8.day.seascapes, species) %>%
  summarise(total_count = sum(count)) %>%
  ungroup()

# Calculate total volume sampled per X8.day.seascapes category
total_vol_per_seascape <- spp_df %>%
  group_by(X8.day.seascapes) %>%
  summarise(total_vol_sampled = sum(total_vol_sampled))

# Merge total counts and total volume data frames
spp_concentration <- merge(spp_count_per_seascape, total_vol_per_seascape, by = "X8.day.seascapes")

# Calculate species per cubic meter: might be misleading since it integrates all counts and sampled volumes across cruises to calculate concentrations. It is likely better to use average concentrations as below.
spp_concentration$species_per_cubic_meter <- spp_concentration$total_count / spp_concentration$total_vol_sampled * 1e6

# # Select desired seascapes and reorder categories in X axis
spp_concentration$X8.day.seascapes <- factor(spp_concentration$X8.day.seascapes, levels = c("3", "5", "7", "11", "13", "15","21","27"))

# Filter out seascape class as desired
# exclude_classes <- c(5, 7, 11)
# spp_concentration <- spp_concentration[!spp_concentration$X8.day.seascapes %in% exclude_classes, ]

custom_pal_hex2 <- c('#e41a1c','#377eb8','#4daf4a','#984ea3','#ff7f00','#ffff33','#a65628','#f781bf')

# Create the stack plot
concentration_stackplot <- ggplot(spp_concentration, aes(x = X8.day.seascapes, y = species_per_cubic_meter, fill = species)) +
  geom_bar(stat = "identity") +
  scale_fill_manual(values = custom_pal_hex2) +
  labs(x = "Seascape class", y = "Species concentration (org. m"^"-3"~")") +
theme_minimal() +
  theme(axis.text.x = element_text(size = 18),  # Set X-axis label font size
        axis.text.y = element_text(size = 18)) +
  theme(axis.title.x = element_text(size = 18),
        axis.title.y = element_text(size = 18)) +
  theme(legend.text = element_text(size = 16))
concentration_stackplot

# Calculate percentages
spp_concentration_percent <- spp_concentration %>%
  group_by(X8.day.seascapes) %>%
  mutate(total_concentration = sum(species_per_cubic_meter),
         species_percentage = (species_per_cubic_meter / total_concentration) * 100)

# Create the stacked plot
concentration_percent_stackplot <- ggplot(spp_concentration_percent, aes(x = X8.day.seascapes, y = species_percentage, fill = species)) +
  geom_bar(stat = "identity") +
  scale_fill_manual(values = custom_pal_hex2) +
  labs(x = "Seascape class", y = "Relative concentration (%)") +
  theme_minimal() +
  theme(axis.text.x = element_text(size = 18),  # Set X-axis label font size
        axis.text.y = element_text(size = 18)) +
  theme(axis.title.x = element_text(size = 18),
        axis.title.y = element_text(size = 18)) +
  theme(legend.text = element_text(size = 16))
concentration_percent_stackplot

# Gather columns containing species counts into key-value pairs and calculate concentrations
spp_concentration_long <- spp_df %>%
  gather(key = "species", value = "count", -(X8.day.seascapes:date)) %>%
  mutate(concentration = (count / total_vol_sampled) * 1e6)

# Calculate the mean concentration and its standard deviation per species and 8X.day.seascapes category. This is likely more representative since the approach will capture high concentration events (high counts in low volumes) that otherwise get filtered out when using an overall integrated sampled volume and total counts for the total concentration.
avg_concentration_per_class <- spp_concentration_long %>%
  group_by(species, `X8.day.seascapes`) %>%
  summarize(
    mean_concentration = mean(concentration, na.rm = TRUE),
    sd_concentration = sd(concentration, na.rm = TRUE)
  )

avg_concentration_per_class$X8.day.seascapes <- factor(avg_concentration_per_class$X8.day.seascapes, levels = c("3", "5", "7", "11", "13", "15","21","27"))

avg_stackplot <- ggplot(avg_concentration_per_class, aes(x = X8.day.seascapes, y = mean_concentration, fill = species)) +
  geom_bar(stat = "identity") +
  # scale_fill_manual(values = custom_pal_hex2) + # use for zooplankton
  scale_colour_brewer(palette = "Set1") + # use for phytoplankton
  labs(x = "Seascape class", y = "Mean density (org. m"^"-3"~")") +
  theme_minimal() +
  theme(axis.text.x = element_text(size = 18),  # Set X-axis label font size
        axis.text.y = element_text(size = 18)) +
  theme(axis.title.x = element_text(size = 18),
        axis.title.y = element_text(size = 18)) +
  theme(legend.text = element_text(size = 16))
avg_stackplot

# Calculate percentages
avg_concentration_percent <- avg_concentration_per_class %>%
  group_by(`X8.day.seascapes`) %>%
  mutate(mean_concentration_percent = mean_concentration / sum(mean_concentration) * 100)

avg_percent_stackplot <- ggplot(avg_concentration_percent, aes(x = X8.day.seascapes, y = mean_concentration_percent, fill = species)) +
  geom_bar(stat = "identity") +
  scale_fill_manual(values = custom_pal_hex2) + # use for zooplankton
  # scale_colour_brewer(palette = "Set2") + # use for phytoplankton
  labs(x = "Seascape class", y = "Relative concentration (%)") +
  theme_minimal() +
  theme(axis.text.x = element_text(size = 18),  # Set X-axis label font size
        axis.text.y = element_text(size = 18)) +
  theme(axis.title.x = element_text(size = 18),
        axis.title.y = element_text(size = 18)) +
  theme(legend.text = element_text(size = 16))
avg_percent_stackplot

```


# Generate plots 
```{r}
library(tidyverse)
library(hrbrthemes)
library(viridis)

# # Load seascape color palette used with Matlab and extract RGB values for observed unique seascapes
# For NOAA machines
# palette_dir <- "/Users/enrique.montes/Library/CloudStorage/GoogleDrive-enriquemontes01@gmail.com/My Drive/GDrive/software/matlab/m_map/seascape_cm"
# For personal machine
palette_dir <- "~/Library/CloudStorage/GoogleDrive-enriquemontes01@gmail.com/My Drive/GDrive/software/matlab/m_map/seascape_cm"
palette_file <- list.files(path = palette_dir, pattern = "cmap1.csv", full.names = TRUE)
palette_df <- read.csv(palette_file, header = FALSE)
colnames(palette_df) <- c("r", "g", "b")
unique_seascapes <- sort(unique(na.omit(ctd_meta$X8.day.seascapes)))
subset_palette_df <- palette_df[unique_seascapes, ]

# set RGB values for the plots
r_vals <- round(subset_palette_df$r * 255, 0)
g_vals <- round(subset_palette_df$g * 255, 0)
b_vals <- round(subset_palette_df$b * 255, 0)
custom_pal <- cbind(r_vals, g_vals, b_vals)
custom_pal_hex <- rgb(custom_pal[, 1], custom_pal[, 2], custom_pal[, 3], maxColorValue=255)
# pal_final <- c(custom_pal_hex[3], custom_pal_hex[4], custom_pal_hex[5], )

# filter out rows with NA values in column seascapes
df_filtered <- taxa_meta[complete.cases(taxa_meta$X8.day.seascapes),]

# Convert the 'x' column to character
df_filtered$X8.day.seascapes <- as.character(df_filtered$X8.day.seascapes)

# # Reorder seascape categories in X axis
df_filtered$X8.day.seascapes <- factor(df_filtered$X8.day.seascapes, levels = c("3", "5", "7", "11", "13", "15","21","27"))

# Define custom colors for each level
custom_colors <- c("3" = custom_pal_hex[1], "5" = custom_pal_hex[2], "7" = custom_pal_hex[3],
                   "11" = custom_pal_hex[4], "13" = custom_pal_hex[5], "15" = custom_pal_hex[6],
                   "21" = custom_pal_hex[7], "27" = custom_pal_hex[8])

# Filter out seascape class as desired
# df_filtered <- df_filtered[df_filtered$X8.day.seascapes != "5", ]

# Plot
# See https://cran.r-project.org/web/packages/viridis/vignettes/intro-to-viridis.html for color palette options
pp <- df_filtered %>%
  ggplot( aes(x=X8.day.seascapes, y=Avg.chl.a..ug.L., fill=X8.day.seascapes)) +
    geom_boxplot() +
    # scale_fill_viridis(option="H", discrete = TRUE, alpha=0.6) +
    scale_fill_manual(values = custom_colors) +
    geom_jitter(color="grey", size=0.8, alpha=0.4) +
    labs(x = "Seascape class") +
    labs(y = expression("["*Chl-a*"]"~ mu*"g"~L^-1)) +
    # labs(y = expression("Salinity")) +
    # labs(y = expression(paste("Temperature (", degree, "C) at 1 m depth"))) +
    # labs(y = expression("["*DO*"]"~mg~L^-1)) +
    # labs(y = expression("["*NO["x"]*"]" ~ mu*"M")) +
    # labs(y = expression("["*PO[4]^"3-"*"]" ~ mu*"M")) +
    # labs(y = expression("["*NH[4]^"+"*"]" ~ mu*"M")) +
    # theme(axis.title.y = element_text(hjust = 1))
    # scale_x_discrete(labels= c("Tropical/Subtropical Upwelling", 
    #                          "Tropical Seas", 
    #                          "Warm, Blooms, High Nuts", 
    #                          "Tropical/Subtropical Transition", 
    #                          "Temperate Transition")) +
    theme_ipsum() +
    theme(
      legend.position="none",
      plot.title = element_text(size=11)
    ) +
    # ggtitle("A boxplot with jitter") +
    # theme(axis.text.x = element_text(angle = 45)) +
  ylim(0, 7) +
  theme(axis.text.x = element_text(size = 32, family = "Arial"),  # Set X-axis label font size
          axis.text.y = element_text(size = 32, family = "Arial")) +
    theme(axis.title.x = element_text(size = 18, hjust = 0.5, family = "Arial"),
          axis.title.y = element_text(size = 18, hjust = 0.5, family = "Arial")) +
    theme(legend.text = element_text(size = 32)) 
pp
```

# Create stackplots showing relative frequency using counts only of plankton taxa per seascape category
```{r}
# convert abundance columns to relative frequency

# subsets taxa for zooplankton
# df_subset <- df_filtered[ , c("X8.day.seascapes",
#                               "Acantharea",
#                               "Copepods",
#                               "Echinoderms",
#                               "Jellies",
#                               "Larvaceans",
#                               "Polychaetes",
#                               "Chaetognaths",
#                               "Pteropods")]

# subsets taxa for phytoplankton
df_subset <- df_filtered[ , c("X8.day.seascapes",
                            "Ceratium",
                            "Chaetoceros",
                            "Chain2",
                            "Chain3",
                            "Guinardia",
                            "Neocalyptrella",
                            "Tricho")]

# subsets taxa for all species
# df_subset <- df_filtered[ , c("X8.day.seascapes",
#                             "Ceratium",
#                             "Chaetoceros",
#                             "Diatoms",
#                             "Diatoms2",
#                             "Guinardia",
#                             "Neocalyptrella",
#                             "Trichodesmium",
#                             "Acantharea",
#                             "Copepods",
#                             "Echinoderms",
#                             "Jellies",
#                             "Larvaceans",
#                             "Polychaetes",
#                             "Chaetognaths",
#                             "Pteropods")]

# reshape the data to long format 
df_long <- tidyr::gather(df_subset, key = "Species", value = "Frequency", -X8.day.seascapes)

# calculate relative frequencies
df_sum <- df_long %>%
  group_by(X8.day.seascapes, Species) %>%
  summarise(n = sum(Frequency)) %>%
  mutate(freq = n / sum(n))

# Seascape class names:
# "Class 11" - Tropical/Subtropical Upwelling
# "Class 15" - Tropical Seas
# "Class 21" - Warm, Blooms, High Nuts
# "Class 3" - Tropical/Subtropical Transition
# "Class 7" - Temperate Transition

# # Select desired seascapes and reorder categories in X axis
df_sum$X8.day.seascapes <- factor(df_sum$X8.day.seascapes, levels = c("3", "5", "7", "11", "13", "15","21","27"))

# Filter out seascape class as desired
exclude_classes <- c(5, 7, 11)
df_sum <- df_sum[!df_sum$X8.day.seascapes %in% exclude_classes, ]

# Use qualitative palettes: https://colorbrewer2.org/#type=qualitative&scheme=Accent&n=7
custom_pal_hex2 <- c('#e41a1c','#377eb8','#4daf4a','#984ea3','#ff7f00','#ffff33','#a65628','#f781bf')
# custom_pal_hex2 <- c('#1b9e77','#d95f02','#7570b3','#e7298a','#66a61e','#e6ab02','#a6761d')

# create the stackplot
qq <- ggplot(df_sum, aes(x = X8.day.seascapes, y = freq, fill = Species)) +
  # scale_fill_viridis(option="Set3", discrete = TRUE, alpha=0.8) +
  scale_fill_viridis(discrete = TRUE, alpha=0.8) +
  # scale_fill_manual(values = custom_pal_hex2) +
  geom_bar(stat = "identity") +
  labs(x = "Seascape class") +
  labs(y = "Relative frequency") +
  # scale_x_discrete(labels= c("Tropical/Subtropical Upwelling", 
  #                            "Tropical Seas", 
  #                            "Warm, Blooms, High Nuts", 
  #                            "Tropical/Subtropical Transition", 
  #                            "Temperate Transition")) +
  # theme(axis.text.x = element_text(angle = 45)) +
  theme_minimal() +
  theme(axis.text.x = element_text(size = 18),  # Set X-axis label font size
        axis.text.y = element_text(size = 18)) +
  theme(axis.title.x = element_text(size = 18),
        axis.title.y = element_text(size = 18)) +
  theme(legend.text = element_text(size = 16)) 
  #theme(axis.text.x = element_text(hjust = 1))
qq
```


# Compute Shannon Index
```{r}
library(dplyr)
library(vegan)

# Select species for Shannon calculation
df_subset_shannon <- df_filtered[ , c("X8.day.seascapes",
                              "Acantharea",
                              "Copepods",
                              "Echinoderms",
                              "Jellies",
                              "Larvaceans",
                              "Polychaets",
                              "Chaetognaths",
                              "Pteropods",
                              "Ceratium",
                              "Chaetoceros",
                              "Chain2",
                              "Chain3",
                              "Guinardia",
                              "Neocalyptrella",
                              "Tricho")]

# reshape the data to long format
df_long <- tidyr::gather(df_subset_shannon, key = "Species", value = "Abundance", -X8.day.seascapes)

# Exclude seascapes
exclude_seascapes <- c(5, 7, 11)

# Compute Shannon diversity per seascape class
shannon_df <- df_long %>% 
  group_by(X8.day.seascapes) %>% 
  summarise(shannon = diversity(Abundance, index = "shannon"))

# Filter the data to exclude specified seascapes
shannon_df_filtered <- shannon_df[!shannon_df$X8.day.seascapes %in% exclude_seascapes, ]

# create the bar plot of Shannon diversity
ff <- ggplot(shannon_df_filtered, aes(x = X8.day.seascapes, y = shannon, fill = X8.day.seascapes)) +
  geom_bar(stat = "identity") +
  # scale_fill_viridis(option="plasma", discrete = TRUE, alpha=0.6) +
  scale_fill_manual(values = custom_colors) +
  xlab("Seascape Class") +
  ylab("Shannon Diversity") +
  # scale_x_discrete(labels= c("Tropical/Subtropical Upwelling", 
  #                            "Tropical Seas", 
  #                            "Warm, Blooms, High Nuts", 
  #                            "Tropical/Subtropical Transition", 
  #                            "Temperate Transition")) +
  # theme(axis.text.x = element_text(angle = 45)) +
  theme(axis.text.x = element_text(size = 32),  # Set X-axis label font size
        axis.text.y = element_text(size = 32)) +
  theme(axis.title.x = element_text(size = 32),
        axis.title.y = element_text(size = 32)) +
  theme(legend.text = element_text(size = 32)) +
  guides()
ff

# # Subset the data to exclude specified seascapes
filtered_taxa_meta <- taxa_meta[complete.cases(taxa_meta$X8.day.seascapes), ]
filtered_taxa_meta <- filtered_taxa_meta[!filtered_taxa_meta$X8.day.seascapes %in% exclude_seascapes, ]

# # Plot sampled seascape frequency
tt <- ggplot(filtered_taxa_meta, aes(x = factor(X8.day.seascapes), fill = factor(X8.day.seascapes))) +
  geom_bar(color = "black", alpha = 0.7) +
  scale_fill_manual(values = custom_colors) +
  labs(x = "Seascape Class", y = "Frequency") +
  theme_minimal() +
    theme(axis.text.x = element_text(size = 22),  # Set X-axis label font size
        axis.text.y = element_text(size = 22)) +
  theme(axis.title.x = element_text(size = 22),
        axis.title.y = element_text(size = 22)) +
  theme(legend.text = element_text(size = 22)) +
  guides(fill = FALSE)
tt

# # Counts of species per seascape
df_sum_abund <- df_long %>%
  group_by(X8.day.seascapes, Species) %>%
  summarise(n = sum(Abundance)) 

df_sum_abund_filt <- df_sum_abund[df_sum_abund$Species == "Pteropods",]
# Filter the data to exclude specified seascapes
df_sum_abund_filt2 <- df_sum_abund_filt[!df_sum_abund_filt$X8.day.seascapes %in% exclude_seascapes, ]

dd <- ggplot(df_sum_abund_filt2, aes(x = factor(X8.day.seascapes), y = n, fill = factor(X8.day.seascapes))) +
  geom_bar(color = "black", alpha = 0.7, stat = "identity") +
  # scale_fill_viridis(option = "magma", discrete = TRUE, alpha = 0.8)
  scale_fill_manual(values = custom_colors) +
  theme_minimal() 
dd

# Compute counts of selected species per seascape class normalized by frequency of seascapes
# reshape the data to long format
df_long2 <- tidyr::gather(df_subset_shannon, key = "Species", value = "Abundance", -X8.day.seascapes)

spp_seascape_normalized <- df_long2 %>%
  filter(Species == "Pteropods") %>%
  group_by(X8.day.seascapes) %>%
  summarise(spp_count = sum(Abundance))

# Compute frequencies of each seascape class
seascape_frequencies <- df_long2 %>%
  count(X8.day.seascapes) %>%
  rename(freq = n)

# Merge counts with frequencies
spp_seascape_normalized <- merge(spp_seascape_normalized, seascape_frequencies, by = "X8.day.seascapes")

# Normalize counts by frequency
spp_seascape_normalized$spp_normalized <- (spp_seascape_normalized$spp_count / spp_seascape_normalized$freq) * 1000
spp_seascape_normalized_filt <- spp_seascape_normalized[!spp_seascape_normalized$X8.day.seascapes %in% exclude_seascapes, ]

bb <- ggplot(spp_seascape_normalized_filt, aes(x = factor(X8.day.seascapes), y = spp_normalized, fill = factor(X8.day.seascapes))) +
  geom_bar(color = "black", alpha = 0.7, stat = "identity") +
  scale_fill_manual(values = custom_colors) +
  labs(x = "Seascape Class", y = "Counts / Seascape freq * 1000") +
  theme_minimal() +
  #   theme(axis.text.x = element_text(size = 32),  # Set X-axis label font size
  #       axis.text.y = element_text(size = 32)) +
  # theme(axis.title.x = element_text(size = 32),
  #       axis.title.y = element_text(size = 32)) +
  # theme(legend.text = element_text(size = 32)) +
  guides()
bb

```

# PC plot
```{r}
library(ggalt)

# Perform principal component analysis on the count data

# for hydrography
sel_vars <- c(
  "X8.day.seascapes",
  "salinity",
  "Avg.chl.a..ug.L.",
  "PO4...uM.",
  "NO3.NO2..uM.",
  "NH4...uM.",
  "temp..degC.")

# for taxonomy
# sel_vars <- c("X8.day.seascapes",
#               "Acantharea",
#               "Copepods",
#               "Echinoderms",
#               "Jellies",
#               "Larvaceans",
#               "Polychaets",
#               "Chaetognaths",
#               "Pteropods")

# sel_vars <- c("X8.day.seascapes",
#               "Ceratium",
#               "Chaetoceros",
#               "Chain2",
#               "Chain3",
#               "Guinardia",
#               "Neocalyptrella",
#               "Tricho")

# sel_vars <- c("X8.day.seascapes",
#               "Acantharea",
#               "Copepods",
#               "Echinoderms",
#               "Jellies",
#               "Larvaceans",
#               "Polychaets",
#               "Chaetognaths",
#               "Pteropods",
#               "Ceratium",
#               "Chaetoceros",
#               "Chain2",
#               "Chain3",
#               "Guinardia",
#               "Neocalyptrella",
#               "Tricho")

# filter out rows with NA values in column seascapes
# df_filtered_pca <- taxa_meta[complete.cases(taxa_meta$X8.day.seascapes), ]
df_filtered_pca <- taxa_meta[complete.cases(taxa_meta[, sel_vars]), ]
# exclude_seascapes <- c(5, 7, 11)
exclude_seascapes <- 0
filt_df_pca <- df_filtered_pca[!df_filtered_pca$X8.day.seascapes %in%
                                 exclude_seascapes, sel_vars]

# Select numeric columns in filt_df_pca
numeric_cols <- sapply(filt_df_pca, is.numeric)
# Identify numeric columns except the first one
numeric_cols <- 2:ncol(filt_df_pca)

# Transform numeric columns to log scale
filt_df_pca[numeric_cols] <- lapply(filt_df_pca[numeric_cols], function(x) log(x + 1))


# pca <- prcomp(filt_df_pca[, c("salinity", "Avg.chl.a..ug.L.", "PO4...uM.", "NO3.NO2..uM.")], scale. = TRUE)
pca <- prcomp(filt_df_pca[, -1], scale. = TRUE)

# Extract PC1 and PC2 scores for each sampling event
# pc_scores <- data.frame(seascape = df_subset$X8.day.seascapes, # for taxonomic analysis 
pc_scores <- data.frame(seascape = as.character(filt_df_pca$X8.day.seascapes), # for hydrography
                        PC1 = pca$x[, 1], 
                        PC2 = pca$x[, 2])

# Create the plot
pc_scores$seascape <- factor(pc_scores$seascape, levels = c("3", "5", "7", "11", "13", "15", "21", "27"))
bb <- ggplot(pc_scores, aes(x = PC1, y = PC2, color = seascape)) + 
  geom_point() +
  labs(x = "PC1", y = "PC2", color = "Seascape") 

# add circle around cluster of data points
# custom_colors_pca <- custom_pal_hex[c(1, 5, 6, 7, 8)]
custom_colors_pca <- custom_colors

yy <- ggplot(pc_scores, aes(x = PC1, y = PC2, color = seascape)) + 
  geom_point() +
  stat_ellipse(aes(fill = seascape), level = 0.90, geom = "polygon", alpha = 0.3, color = "black") +
  scale_color_manual(values = custom_colors_pca) +
  scale_fill_manual(values = custom_colors_pca) +
  theme_classic() +
  xlim(-2,2) +
  ylim(-2,2) +
  # xlim(-1,1) +
  # ylim(-1,1) +
  geom_point(size=2) +
  guides(colour = guide_legend(override.aes = list(size=2))) + 
  theme(axis.text.x = element_text(size = 32),  # Set X-axis label font size
        axis.text.y = element_text(size = 32)) +
  theme(axis.title.x = element_text(size = 32),
        axis.title.y = element_text(size = 32)) +
  theme(legend.text = element_text(size = 32))

yy

################################################################################################
# # Create PCA with eigenvectors
# Extract principal component scores
pc_scores2 <- pca$x
# Extract eigenvectors
eigenvectors <- pca$rotation
# Calculate the percentage variance explained by each principal component
total_variance <- sum(pca$sdev^2)
pc_var_percent <- round(100 * (pca$sdev^2) / total_variance, 1)

# Convert X8.day.seascapes to a factor
filt_df_pca$X8.day.seascapes <- as.factor(filt_df_pca$X8.day.seascapes)

# # For hydrography
qq <- ggplot(filt_df_pca, aes(x = pc_scores2[,1], y = pc_scores2[,2], color = X8.day.seascapes)) +
  geom_point(size = 3, alpha = 0.7) +
  scale_color_manual(values = custom_colors_pca) +
  geom_segment(aes(x = 0, y = 0, xend = eigenvectors[1, 1], yend = eigenvectors[2, 1]),
               arrow = arrow(length = unit(0.1, "inches")), color = "black") +  # Add vector for PC1
  geom_segment(aes(x = 0, y = 0, xend = eigenvectors[1, 2], yend = eigenvectors[2, 2]),
               arrow = arrow(length = unit(0.1, "inches")), color = "black") + # Add vector for PC2
  geom_segment(aes(x = 0, y = 0, xend = eigenvectors[1, 3], yend = eigenvectors[2, 3]),
               arrow = arrow(length = unit(0.1, "inches")), color = "black") + # Add vector for PC3
  geom_segment(aes(x = 0, y = 0, xend = eigenvectors[1, 4], yend = eigenvectors[2, 4]),
               arrow = arrow(length = unit(0.1, "inches")), color = "black") + # Add vector for PC4
  geom_segment(aes(x = 0, y = 0, xend = eigenvectors[1, 5], yend = eigenvectors[2, 5]),
               arrow = arrow(length = unit(0.1, "inches")), color = "black") + # Add vector for PC5
  geom_segment(aes(x = 0, y = 0, xend = eigenvectors[1, 6], yend = eigenvectors[2, 6]),
               arrow = arrow(length = unit(0.1, "inches")), color = "black") + # Add vector for PC6
  geom_text(aes(x = eigenvectors[1, 1], y = eigenvectors[2, 1], 
                label = paste("PC1 (", pc_var_percent[1], "%: Salinity)", sep = "")),
            vjust = -0.5, hjust = 0.5, color = "black") +  # Add label for PC1
  geom_text(aes(x = eigenvectors[1, 2], y = eigenvectors[2, 2], 
                label = paste("PC2 (", pc_var_percent[2], "%: Chla)", sep = "")),
            vjust = -0.5, hjust = 0.5, color = "black") +  # Add label for PC2
  geom_text(aes(x = eigenvectors[1, 3], y = eigenvectors[2, 3], 
                label = paste("PC3 (", pc_var_percent[3], "%: Phosphate)", sep = "")),
            vjust = -0.5, hjust = 0.5, color = "black") +  # Add label for PC3
  geom_text(aes(x = eigenvectors[1, 4], y = eigenvectors[2, 4], 
                label = paste("PC4 (", pc_var_percent[4], "%: NOx)", sep = "")),
            vjust = -0.5, hjust = 0.5, color = "black") +  # Add label for PC4
  geom_text(aes(x = eigenvectors[1, 5], y = eigenvectors[2, 5], 
                label = paste("PC5 (", pc_var_percent[5], "%: Ammonium)", sep = "")),
            vjust = -0.5, hjust = 0.5, color = "black") +  # Add label for PC5
  geom_text(aes(x = eigenvectors[1, 6], y = eigenvectors[2, 6], 
                label = paste("PC6 (", pc_var_percent[6], "%: Temperature)", sep = "")),
            vjust = -0.5, hjust = 0.5, color = "black") +  # Add label for PC6
  labs(x = "PC1", y = "PC2", color = "Seascape class") +
  xlim(-1.5, 1.5) +
  ylim(-1.5, 1.5) +
  guides(colour = guide_legend(override.aes = list(size=5))) + 
  theme_classic() +
  theme(axis.text.x = element_text(size = 18),  # Set X-axis label font size
        axis.text.y = element_text(size = 18)) +
  theme(axis.title.x = element_text(size = 18),
        axis.title.y = element_text(size = 18)) +
  theme(legend.text = element_text(size = 18)) +
  theme(legend.title = element_text(size = 18))
qq


# # # For phytoplankton
# qq2 <- ggplot(filt_df_pca, aes(x = pc_scores2[,1], y = pc_scores2[,3], color = X8.day.seascapes)) +
#   geom_point(size = 4) +
#   scale_color_manual(values = custom_colors_pca) +
#   geom_segment(aes(x = 0, y = 0, xend = eigenvectors[1, 1], yend = eigenvectors[2, 1]),
#                arrow = arrow(length = unit(0.2, "inches")), color = "black") +  # Add vector for PC1
#   geom_segment(aes(x = 0, y = 0, xend = eigenvectors[1, 2], yend = eigenvectors[2, 2]),
#                arrow = arrow(length = unit(0.2, "inches")), color = "black") + # Add vector for PC2
#   geom_segment(aes(x = 0, y = 0, xend = eigenvectors[1, 3], yend = eigenvectors[2, 3]),
#                arrow = arrow(length = unit(0.2, "inches")), color = "black") + # Add vector for PC3
#   geom_segment(aes(x = 0, y = 0, xend = eigenvectors[1, 4], yend = eigenvectors[2, 4]),
#                arrow = arrow(length = unit(0.2, "inches")), color = "black") + # Add vector for PC4
#   geom_segment(aes(x = 0, y = 0, xend = eigenvectors[1, 5], yend = eigenvectors[2, 5]),
#                arrow = arrow(length = unit(0.2, "inches")), color = "black") + # Add vector for PC5
#   geom_segment(aes(x = 0, y = 0, xend = eigenvectors[1, 6], yend = eigenvectors[2, 6]),
#                arrow = arrow(length = unit(0.2, "inches")), color = "black") + # Add vector for PC6
#   geom_segment(aes(x = 0, y = 0, xend = eigenvectors[1, 7], yend = eigenvectors[2, 7]),
#                arrow = arrow(length = unit(0.2, "inches")), color = "black") + # Add vector for PC7
#   geom_text(aes(x = eigenvectors[1, 1], y = eigenvectors[2, 1], 
#                 label = paste("PC1 (", pc_var_percent[1], "%: Ceratium)", sep = "")),
#             vjust = -0.5, hjust = 0.5, color = "black") +  # Add label for PC1
#   geom_text(aes(x = eigenvectors[1, 2], y = eigenvectors[2, 2], 
#                 label = paste("PC2 (", pc_var_percent[2], "%: Chaetoceros)", sep = "")),
#             vjust = -0.5, hjust = 0.5, color = "black") +  # Add label for PC2
#   geom_text(aes(x = eigenvectors[1, 3], y = eigenvectors[2, 3], 
#                 label = paste("PC3 (", pc_var_percent[3], "%: Diatoms)", sep = "")),
#             vjust = -0.5, hjust = 0.5, color = "black") +  # Add label for PC3
#   geom_text(aes(x = eigenvectors[1, 4], y = eigenvectors[2, 4], 
#                 label = paste("PC4 (", pc_var_percent[4], "%: Diatoms2)", sep = "")),
#             vjust = -0.5, hjust = 0.5, color = "black") +  # Add label for PC4
#   geom_text(aes(x = eigenvectors[1, 5], y = eigenvectors[2, 5], 
#                 label = paste("PC5 (", pc_var_percent[5], "%: Guinardia)", sep = "")),
#             vjust = -0.5, hjust = 0.5, color = "black") +  # Add label for PC5
#   geom_text(aes(x = eigenvectors[1, 6], y = eigenvectors[2, 6], 
#                 label = paste("PC6 (", pc_var_percent[6], "%: Neocalyptrella)", sep = "")),
#             vjust = -0.5, hjust = 0.5, color = "black") +  # Add label for PC6
#   geom_text(aes(x = eigenvectors[1, 7], y = eigenvectors[2, 7], 
#                 label = paste("PC7 (", pc_var_percent[7], "%: Trichodesmium)", sep = "")),
#             vjust = -0.5, hjust = 0.5, color = "black") +  # Add label for PC7
#   labs(x = "PC1", y = "PC3", color = "X8.day.seascapes") +
#   xlim(-1, 1) +
#   ylim(-1, 1) +
#   guides(colour = guide_legend(override.aes = list(size=2))) + 
#   theme_classic() +
#   theme(axis.text.x = element_text(size = 32),  # Set X-axis label font size
#         axis.text.y = element_text(size = 32)) +
#   theme(axis.title.x = element_text(size = 32),
#         axis.title.y = element_text(size = 32)) +
#   theme(legend.text = element_text(size = 32)) 
# qq2
# 
# 
# # # For zooplankton
# qq3 <- ggplot(filt_df_pca, aes(x = pc_scores2[,1], y = pc_scores2[,2], color = X8.day.seascapes)) +
#   geom_point(size = 4) +
#   scale_color_manual(values = custom_colors_pca) +
#   geom_segment(aes(x = 0, y = 0, xend = eigenvectors[1, 1], yend = eigenvectors[2, 1]),
#                arrow = arrow(length = unit(0.2, "inches")), color = "black") +  # Add vector for PC1
#   geom_segment(aes(x = 0, y = 0, xend = eigenvectors[1, 2], yend = eigenvectors[2, 2]),
#                arrow = arrow(length = unit(0.2, "inches")), color = "black") + # Add vector for PC2
#   geom_segment(aes(x = 0, y = 0, xend = eigenvectors[1, 3], yend = eigenvectors[2, 3]),
#                arrow = arrow(length = unit(0.2, "inches")), color = "black") + # Add vector for PC3
#   geom_segment(aes(x = 0, y = 0, xend = eigenvectors[1, 4], yend = eigenvectors[2, 4]),
#                arrow = arrow(length = unit(0.2, "inches")), color = "black") + # Add vector for PC4
#   geom_segment(aes(x = 0, y = 0, xend = eigenvectors[1, 5], yend = eigenvectors[2, 5]),
#                arrow = arrow(length = unit(0.2, "inches")), color = "black") + # Add vector for PC5
#   geom_segment(aes(x = 0, y = 0, xend = eigenvectors[1, 6], yend = eigenvectors[2, 6]),
#                arrow = arrow(length = unit(0.2, "inches")), color = "black") + # Add vector for PC6
#   geom_segment(aes(x = 0, y = 0, xend = eigenvectors[1, 7], yend = eigenvectors[2, 7]),
#                arrow = arrow(length = unit(0.2, "inches")), color = "black") + # Add vector for PC7
#   geom_segment(aes(x = 0, y = 0, xend = eigenvectors[1, 8], yend = eigenvectors[2, 8]),
#                arrow = arrow(length = unit(0.2, "inches")), color = "black") + # Add vector for PC8
#   geom_text(aes(x = eigenvectors[1, 1], y = eigenvectors[2, 1], 
#                 label = paste("PC1 (", pc_var_percent[1], "%: Acantharea)", sep = "")),
#             vjust = -0.5, hjust = 0.5, color = "black") +  # Add label for PC1
#   geom_text(aes(x = eigenvectors[1, 2], y = eigenvectors[2, 2], 
#                 label = paste("PC2 (", pc_var_percent[2], "%: Copepods)", sep = "")),
#             vjust = -0.5, hjust = 0.5, color = "black") +  # Add label for PC2
#   geom_text(aes(x = eigenvectors[1, 3], y = eigenvectors[2, 3], 
#                 label = paste("PC3 (", pc_var_percent[3], "%: Echinoderms)", sep = "")),
#             vjust = -0.5, hjust = 0.5, color = "black") +  # Add label for PC3
#   geom_text(aes(x = eigenvectors[1, 4], y = eigenvectors[2, 4], 
#                 label = paste("PC4 (", pc_var_percent[4], "%: Jellies)", sep = "")),
#             vjust = -0.5, hjust = 0.5, color = "black") +  # Add label for PC4
#   geom_text(aes(x = eigenvectors[1, 5], y = eigenvectors[2, 5], 
#                 label = paste("PC5 (", pc_var_percent[5], "%: Larvaceans)", sep = "")),
#             vjust = -0.5, hjust = 0.5, color = "black") +  # Add label for PC5
#   geom_text(aes(x = eigenvectors[1, 6], y = eigenvectors[2, 6], 
#                 label = paste("PC6 (", pc_var_percent[6], "%: Polychaetes)", sep = "")),
#             vjust = -0.5, hjust = 0.5, color = "black") +  # Add label for PC6
#   geom_text(aes(x = eigenvectors[1, 7], y = eigenvectors[2, 7], 
#                 label = paste("PC7 (", pc_var_percent[7], "%: Chaetognaths)", sep = "")),
#             vjust = -0.5, hjust = 0.5, color = "black") +  # Add label for PC7
#   geom_text(aes(x = eigenvectors[1, 8], y = eigenvectors[2, 8], 
#                 label = paste("PC8 (", pc_var_percent[8], "%: Pteropods)", sep = "")),
#             vjust = -0.5, hjust = 0.5, color = "black") +  # Add label for PC8
#   labs(x = "PC1", y = "PC2", color = "X8.day.seascapes") +
#   xlim(-1, 1) +
#   ylim(-1, 1) +
#   guides(colour = guide_legend(override.aes = list(size=2))) + 
#   theme_classic() +
#   theme(axis.text.x = element_text(size = 32),  # Set X-axis label font size
#         axis.text.y = element_text(size = 32)) +
#   theme(axis.title.x = element_text(size = 32),
#         axis.title.y = element_text(size = 32)) +
#   theme(legend.text = element_text(size = 32)) 
# qq3

```


# Map overall WEIGHTED mean concentration values of selected taxa by transect
```{r}
library(ggOceanMaps)
library(leaflet)
library(patchwork)

# For zooplankton
taxa_meta_concentration <- taxa_meta %>%
  mutate(across(c(Acantharea,Chaetognaths,Ostracods,Copepods,Decapods,Echinoderms,Jellies,
    Larvaceans,Polychaets,Pteropods), ~ ./total_vol_sampled * 1e6)) %>%
  select(Station, dec_lat, dec_lon, year, month, date, Avg.chl.a..ug.L.,
         temp..degC., salinity, total_vol_sampled,
         Acantharea,Chaetognaths,Ostracods,Copepods,Decapods,Echinoderms,
         Jellies,Larvaceans,Polychaets,Pteropods) %>%
  filter(!is.na(total_vol_sampled))

# # For phytoplankton
# taxa_meta_concentration <- taxa_meta %>%
#   mutate(across(c(Centric,Ceratium,Chaetoceros,Chain2,Chain3,Guinardia,Neocalyptrella,
#                   Noctiluca,Tricho), ~ ./total_vol_sampled * 1e6)) %>%
#   select(Station, dec_lat, dec_lon, year, month, date, Avg.chl.a..ug.L.,
#          temp..degC., salinity,total_vol_sampled,
#          Centric,Ceratium,Chaetoceros,Chain2,Chain3,Guinardia,Neocalyptrella,
#                   Noctiluca,Tricho) %>%
#   filter(!is.na(total_vol_sampled))

# List of class names to loop through
class_names <- c("Acantharea", "Chaetognaths", "Ostracods", "Copepods", "Decapods", "Echinoderms", "Jellies", "Larvaceans", "Polychaets", "Pteropods")

class_names <- "Acantharea"

path_sfer_list <- "~/Library/CloudStorage/GoogleDrive-enriquemontes01@gmail.com/My Drive/GDrive/proposals/2022_02_MultiStressor_NOAA/module_1"

sfer_curated <- list.files(path_sfer_list, pattern = "sfer_stations_curated.csv", full.names = TRUE)
sfer_sta_list <- read.csv(sfer_curated, header = TRUE)

# Loop over each class name to generate maps
# for (class_name in class_names) {

    # # Filter concentration data for the current class and merges curated station list with the concentration df
    concentration_df <- taxa_meta_concentration %>%
      left_join(sfer_sta_list %>% filter(station_class %in% "C"), by = c('Station' = 'station_id'))
    
    # Compute weighted average lat/lons and mean variable values for the current class
    taxa_concentration_avg <- concentration_df %>%
      group_by(line_id) %>%
      mutate(weight = get(class_name) / sum(get(class_name), na.rm = TRUE)) %>%
      summarise(longitude = weighted.mean(dec_lon, w = weight, na.rm = TRUE),
                latitude = weighted.mean(dec_lat, w = weight, na.rm = TRUE),
                sel_taxa_mean = mean(get(class_name), na.rm = TRUE),
                sel_taxa_sd = sd(get(class_name), na.rm = TRUE))

    # Prepare data for mapping
    dt <- data.frame(
      lon = taxa_concentration_avg$longitude, 
      lat = taxa_concentration_avg$latitude, 
      mean_param = taxa_concentration_avg$sel_taxa_mean,
      sd_param = taxa_concentration_avg$sel_taxa_sd)
    
    # Add station markers
    station_markers <- sfer_sta_list %>% filter(station_class %in% "C")
    
    # Create the map
    concentration_map <- basemap(limits = c(-86, -79.5, 24, 28.5), bathymetry = TRUE) +
      geom_point(data = dt, aes(x = lon, y = lat, size = mean_param, color = sd_param)) +
      scale_color_gradient(low = "yellow", high = "red", na.value = NA, name = "Standard Deviation") +
      geom_point(data = station_markers, aes(x = mean_lon, y = mean_lat), color = "black", shape = 3, size = 1.5) +
      scale_size_continuous(name = "Average concentration", 
                            breaks = c(250, 500, 1000),
                            range = c(0, 10),
                            guide = guide_legend(override.aes = list(color = "black", fill = "white"))) +
    theme_minimal() +
    ggtitle(paste(class_name))
      
    # Optionally, save each map as an image
    ggsave(paste0("concentration_map_", class_name, ".png"), plot = concentration_map, width = 8, height = 6)
# }
```


# Map overall mean concentration values of selected taxa per station
```{r}
# Compute mean concentration of selected groups per station
taxa_concentration_avg_station <- taxa_meta_concentration %>%
  group_by(Station) %>%
  summarise(longitude_station = mean(dec_lon, na.rm = TRUE),
            latitude_station = mean(dec_lat, na.rm = TRUE),
            sel_taxa_mean_station = mean(Tricho, na.rm = TRUE),
            sel_taxa_sd_station = sd(Tricho, na.rm = TRUE))

# # Map with bathymetry
  # Overlay color-scaled dots
dt_station <- data.frame(
  lon_station = taxa_concentration_avg_station$longitude_station, 
  lat_station = taxa_concentration_avg_station$latitude_station, 
  mean_param_station = taxa_concentration_avg_station$sel_taxa_mean_station,
  sd_param_station = taxa_concentration_avg_station$sel_taxa_sd_station)

# Replace NA by zeros
dt_station <- dt_station %>%
  mutate(sd_param_station = replace_na(sd_param_station, 0))

concentration_map_station <- basemap(limits = c(-86, -79.5, 24, 28.5), bathymetry = TRUE) +
  geom_point(data = dt_station, aes(x = lon_station, 
                                    y = lat_station, 
                                    size = mean_param_station, 
                                    color = sd_param_station)) +
  scale_color_gradient(low = "yellow", high = "red", na.value = NA, name = "Standard Deviation") +
  geom_point(data = station_markers, aes(x = mean_lon, y = mean_lat), color = "black", shape = 3, size = 1.5) +
  scale_size_continuous(name = "Average concentration", 
                        breaks = c(250, 500, 750),
                        range = c(1, 10),
                        guide = guide_legend(override.aes = list(color = "black", fill = "white"))) +
  theme_minimal() 
concentration_map_station
```


# Map mean count values of selected taxa and specified dates
```{r}
# # ggOceanMaps:
# https://biostats-r.github.io/biostats/workingInR/140_maps.html
# https://github.com/MikkoVihtakari/ggOceanMaps: Use this one: remotes::install_github("MikkoVihtakari/ggOceanMaps")
# install.packages("ggOceanMaps") # # This is outdated, don't use!!!
library(ggOceanMaps)
library(leaflet)

# For NOAA machines
# path_sfer_list <- "/Users/enrique.montes/Library/CloudStorage/GoogleDrive-enriquemontes01@gmail.com/My Drive/GDrive/proposals/2022_02_MultiStressor_NOAA/module_1"
# For personal machine
path_sfer_list <- "~/Library/CloudStorage/GoogleDrive-enriquemontes01@gmail.com/My Drive/GDrive/proposals/2022_02_MultiStressor_NOAA/module_1"

sfer_curated <- list.files(path_sfer_list, pattern = "sfer_stations_curated.csv", full.names = TRUE)

sfer_sta_list <- read.csv(sfer_curated, header = TRUE)

merged_data <- taxa_meta %>%
  left_join(sfer_sta_list %>% filter(station_class %in% "C"), by = c('Station' = 'station_id'))

# Replace 2023 and 10 with your selected year and month
selected_year <- 2023
selected_month <- 11

# Filter rows
filtered_taxa_meta <- merged_data %>%
  filter(year == selected_year, month == selected_month)

# filtered_taxa_meta <- merged_data

# Compute weighted average lat lons based on selected variable, and mean variable values (e.g., Copepods)
taxa_avg <- filtered_taxa_meta %>%
  group_by(line_id) %>%
  mutate(weight = Guinardia / sum(Guinardia, na.rm = TRUE)) %>%
  summarise(longitude = weighted.mean(dec_lon, w = weight, na.rm = TRUE),
            latitude = weighted.mean(dec_lat, w = weight, na.rm = TRUE),
            sel_taxa_mean = mean(Guinardia, na.rm = TRUE))

# # Filter out values less than 1
filtered_taxa_avg <- taxa_avg %>%
  filter(sel_taxa_mean > 0)


# Find the minimum and maximum values of sel_taxa_mean
min_value <- min(filtered_taxa_avg$sel_taxa_mean, na.rm = TRUE)
max_value <- max(filtered_taxa_avg$sel_taxa_mean, na.rm = TRUE)

# # Adjust the color palette with specified values
# color_palette <- colorNumeric(palette = "Oranges", domain = c(min_value, max_value))

# # Create the map
# map <- leaflet(filtered_taxa_avg) %>%
#   addProviderTiles("Esri.WorldImagery") %>%
#   addCircleMarkers(
#     lng = ~longitude,
#     lat = ~latitude,
#     radius = 10,
#     color = ~color_palette(sel_taxa_mean),
#     fillOpacity = 0.8,
#     popup = ~paste("Line ID: ", line_id, "<br>Copepods: ", sel_taxa_mean)
#   ) %>%
#   addLegend(
#     position = "bottomright",
#     pal = color_palette,
#     values = c(min_value, max_value),  # Adjusted values here
#     title = "Copepod occurrences",
#     opacity = 1
#   )
# map

# # Map with bathymetry
  # Overlay color-scaled dots
dt <- data.frame(
  lon = filtered_taxa_avg$longitude, 
  lat = filtered_taxa_avg$latitude, 
  sel_param = filtered_taxa_avg$sel_taxa_mean)

# # Colors:
# Acantharea = darkturquoise; breaks = c(0.25, 1, 2.5, 5); limits = c(0.1, 10)
# Copepods = red; breaks = c(0.25, 1, 2.5, 5, 10); limits = c(0.1, 15)
# Chain diatoms = orange; breaks = c(0.5, 1, 10, 30, 60); c(0.1, 70)
# Chaetoceros = purple; breaks = c(0.25, 1, 2.5, 5, 10); limits = c(0.1, 15)
# Echinoderms = magenta; breaks = c(0.5, 1, 3, 5); limits = c(0.1, 5)
# Larvaceans = plum; breaks = c(0.5, 1, 3, 5); limits = c(0.1, 5)
# Polychaetes = dodgerblue; breaks = c(0.25, 0.5, 1, 3); limits = c(0.1, 3)
# Jellies = slategray4; breaks = c(0.25, 0.5, 1, 2); limits = c(0.1, 3)
    
occ_map <- basemap(limits = c(-84, -79.5, 24, 28.5), bathymetry = TRUE) +
  geom_point(data = dt, aes(x = lon, y = lat, size = sel_param), fill = "olivedrab2", color = "olivedrab2") +
  geom_point(data = filtered_taxa_meta, aes(x = dec_lon, y = dec_lat), color = "black", shape = 3, size = 1.5) +
  scale_size_continuous(name = "Average counts", 
                        breaks = c(0.5, 1, 3, 5, 10),
                        limits = c(0.1, 40),
                        range = c(1, 15)) +
  theme_minimal() 
occ_map

# occ_map_leg <- ggplot(dt, aes(lon, lat)) +
#   geom_point(aes(size = sel_param), fill = "green", color ="green") +
#   theme_minimal()
# occ_map_leg
```


# Create time-series plots of selected taxa at specific stations
```{r}
selected_variable <- 'Chaetoceros'
# # Lower Keys
# selected_line_id <- c('LK','WS','MO','KW')
# # Middle Keys
# selected_line_id <- c('CO', 'CR')
# # Other regions
selected_line_id <- c('CAL')

# Filter the data for the selected line_id
filtered_merged_data <- merged_data[merged_data$line_id %in% selected_line_id, ]
# Remove rows with NAs in relevant columns
monthly_means <- na.omit(filtered_merged_data[, c("date", selected_variable, "line_id")]) %>%
  group_by(year_month = format(date, "%Y-%m")) %>%
  summarise(mean_value = mean(!!sym(selected_variable), na.rm = TRUE),
             se_value = sd(!!sym(selected_variable), na.rm = TRUE) / sqrt(n()))

# Create a time series plot
ggplot(monthly_means, aes(x = year_month, y = mean_value)) +
  geom_bar(stat = "identity", fill = "orange") +
  geom_errorbar(aes(ymin = mean_value - se_value, ymax = mean_value + se_value),
                width = 0.2, # Adjust width as needed
                position = position_dodge(width = 0.8)) + # Adjust width as needed
  labs(x = "Year-Month", y = "Mean Value") +
  ggtitle("Monthly Means of Selected Variable") +
  theme_minimal()

```






# Match file name with string ID - DO NOT USE
```{r}
# library(tidyverse)
# library(lubridate)
# library(dplyr)
# 
# # extract relevant part of the strings
# df <- rbind(class.Copepods,
#             class.Eucampia,
#             class.Noctiluca,
#             class.Polychaets,
#             class.Acantharea,
#             class.Centric,
#             class.Ceratium,
#             class.Chaetoceros,
#             class.Chain2,
#             class.Chain3,
#             class.Chain4,
#             class.Ostracods,
#             class.Jellies,
#             class.Larvaceans,
#             class.pellets)
# sub_strings <- substr(df$V1, start = 10, stop = 22)
# unique_all <- unique(sub_strings)
# 
# # select unique dates (this allows to search CTD records per date and time)
# # To find unique dates and times to extract CDT data use: unique_all[grepl("20221209", unique_all)]
# 
# id_list <- unique_all
# id_list2  <- as.POSIXct(id_list, format="%Y%m%d_%H%M", tz="UTC")
# 
# conc_occ_count <- data.frame(date = as.Date(character()), stringsAsFactors = FALSE)
# 
# for ( i in seq_along(id_list)){
#   acantha <- as.data.frame(str_count(class.Acantharea$V1, id_list[i]))
#   centric <- as.data.frame(str_count(class.Centric$V1, id_list[i]))
#   ceratium <- as.data.frame(str_count(class.Ceratium$V1, id_list[i]))
#   chaetoceros <- as.data.frame(str_count(class.Chaetoceros$V1, id_list[i]))
#   chaetog <- as.data.frame(str_count(class.Chaetognaths$V1, id_list[i]))
#   chain2 <- as.data.frame(str_count(class.Chain2$V1, id_list[i]))
#   chain3 <- as.data.frame(str_count(class.Chain3$V1, id_list[i]))
#   chain4 <- as.data.frame(str_count(class.Chain4$V1, id_list[i]))
#   ostra <- as.data.frame(str_count(class.Ostracods$V1, id_list[i]))
#   copepods <- as.data.frame(str_count(class.Copepods$V1, id_list[i]))
#   decapod <- as.data.frame(str_count(class.Decapods$V1, id_list[i]))
#   echino <- as.data.frame(str_count(class.Echinoderms$V1, id_list[i]))
#   eucampia <- as.data.frame(str_count(class.Eucampia$V1, id_list[i]))
#   jellies <- as.data.frame(str_count(class.Jellies$V1, id_list[i]))
#   larvae <- as.data.frame(str_count(class.Larvaceans$V1, id_list[i]))
#   nocti <- as.data.frame(str_count(class.Noctiluca$V1, id_list[i]))
#   polychaetes <- as.data.frame(str_count(class.Polychaets$V1, id_list[i]))
#   tricho <- as.data.frame(str_count(class.Tricho$V1, id_list[i]))
# 
#   Acantharea <- colSums(acantha != 0)
#   Centric <- colSums(centric != 0)
#   Ceratium_spp <- colSums(ceratium != 0)
#   Chaetoceros <- colSums(chaetoceros != 0)
#   Chaetognaths <- colSums(chaetog != 0)
#   Diatom_chains_1 <- colSums(chain2 != 0)
#   Diatom_chains_2 <- colSums(chain3 != 0)
#   Diatom_chains_3 <- colSums(chain4 != 0)
#   Ostracods <- colSums(ostra != 0)
#   Copepods <- colSums(copepods != 0)
#   Decapods <- colSums(decapod != 0)
#   Echinoderms <- colSums(echino != 0)
#   Eucampia_spp <- colSums(eucampia != 0)
#   Jellies<- colSums(jellies != 0)
#   Larvaceans <- colSums(larvae != 0)
#   Noctiluca <- colSums(nocti != 0)
#   Polychaetes <- colSums(polychaetes != 0)
#   Trichodesmium_spp <- colSums(tricho != 0)
# 
#   # Parse the date-time string with ymd_hm()
#   occ_datetime  <- as.POSIXct(id_list[i], format="%Y%m%d_%H%M", tz="UTC")
#   occ_datetime_str <- substr(id_list[i], 1, 13)
# 
#   row_df <- data.frame(date = occ_datetime, occ_datetime_str,
#                        Acantharea,
#                        Centric,
#                        Ceratium_spp,
#                        Chaetoceros,
#                        Chaetognaths,
#                        Diatom_chains_1,
#                        Diatom_chains_2,
#                        Diatom_chains_3,
#                        Ostracods,
#                        Copepods,
#                        Decapods,
#                        Echinoderms,
#                        Eucampia_spp,
#                        Jellies,
#                        Larvaceans,
#                        Noctiluca,
#                        Polychaetes,
#                        Trichodesmium_spp
#                        )
#   rownames(row_df) <- i
# 
#   conc_occ_count <- rbind(conc_occ_count, row_df)
# }
# 
# conc_occ_final <- arrange(conc_occ_count, date)

```


# Match image records with CTD metadata and seascapes (use with strings matching) - DO NOT USE
```{r}
# # Directory where the CTD metadata is located
# dir_path2 <- "~/enriquemontes01@gmail.com - Google Drive/My Drive/GDrive/OCED_AOML/WS_cruises/plankton_imaging/CPICS/ws_cruise_ctd/"
# file_name <- list.files(path = dir_path2, pattern = ".csv", full.names = TRUE)
# ctd_meta <- read.csv(file_name, fill = TRUE)
# 
# dt_list <- ctd_meta$GMT.datetime
# 
# conc_event <- data.frame()
# 
# for ( t in seq_along(dt_list)){
#   event <- str_count(conc_occ_final$occ_datetime_str, dt_list[t])
#   idx_event <- which(event == 1, arr.ind = TRUE)
#   occ_row <- conc_occ_final[idx_event, ]
#   event_meta <- ctd_meta[idx_event, ]
#   conc_event <- rbind(conc_event, occ_row)
# }
# 
# taxa_meta <- cbind(ctd_meta, conc_event)
```
